From 49ec05c8eeda8330af625b7e580a064e8f8c6dfa Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Mon, 10 Apr 2023 16:56:07 +0800 Subject: [PATCH 001/405] first commit --- cmake/external/flashattn.cmake | 7 +- .../operators/fused/fused_gate_attention.h | 362 ++++++++++++++---- paddle/phi/backends/dynload/flashattn.h | 3 +- 3 files changed, 287 insertions(+), 85 deletions(-) diff --git a/cmake/external/flashattn.cmake b/cmake/external/flashattn.cmake index eae35d90f50f0..53b11da6a6732 100644 --- a/cmake/external/flashattn.cmake +++ b/cmake/external/flashattn.cmake @@ -19,8 +19,11 @@ add_definitions(-DPADDLE_WITH_FLASHATTN) set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn) set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn) set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn) -set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git) -set(FLASHATTN_TAG f0edf243a813a65d05c75fcb331b2a95faf96bbc) +set(FLASHATTN_REPOSITORY ${GIT_URL}/JamesLim-sy/flash-attention.git) +set(FLASHATTN_TAG 05ecd140356cffe46444cd364944f6ec5067e724) + +#set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git) +#set(FLASHATTN_TAG f0edf243a813a65d05c75fcb331b2a95faf96bbc) set(FLASHATTN_INCLUDE_DIR "${FLASHATTN_INSTALL_DIR}/include" diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index c8c4733df2e2e..7f634e835cadf 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/kernels/arange_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" @@ -146,6 +147,22 @@ struct GateAttentionConfig { gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, head_dim}; } + bool UseFlashAttn(const bool merge_qkv, const int64_t head_dim) { + if (merge_qkv) { + switch (head_dim) { + case 16: + case 32: + case 64: + case 128: + return true; + default: + return false; + } + } else { + return false; + } + } + int64_t GetQuerySize() const { return batch_size * seq_len_m * seq_len_r * num_heads * head_dim; } @@ -317,6 +334,15 @@ struct GateAttentionGradConfig : public GateAttentionConfig { phi::DenseTensor qk_out_grad; }; +template +__global__ void FlashAttRange( + int start, int step, int size, OUT_TYPE* out1, OUT_TYPE* out2) { + CUDA_KERNEL_LOOP(index, size) { + out1[index] = static_cast(start + step * index); + out2[index] = static_cast(start + step * index); + } +} + template class FMHAGateRef { public: @@ -336,106 +362,278 @@ class FMHAGateRef { T* q_ptr = nullptr; T* k_ptr = nullptr; T* v_ptr = nullptr; - if (merge_qkv_) { - // qkv_transpose_out = transpose(qkv_out) + + if (config->UseFlashAttn(merge_qkv_, config->head_dim)) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); - phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); + // batch_dims = q.shape[:-3]; + // n, no_heads, c = q.shape[-3:] + // 1. Dealing with qkv_out for flash_attn. 
+ auto& qkv_dims = qkv_transpose_out.dims(); + auto rank = qkv_dims.size(); + auto n = qkv_dims[rank - 3]; + auto no_heads = qkv_dims[rank - 2]; + auto c = qkv_dims[rank - 1]; + + int64_t q_batch_size = 1; + for (int i = 0; i < (rank - 3); ++i) { + q_batch_size *= qkv_dims[i]; + } + qkv_transpose_out.Resize({q_batch_size * n, no_heads, c}); // q_size == k_size int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); k_ptr = q_ptr + q_size; v_ptr = k_ptr + q_size; - } else { - PADDLE_ENFORCE_NOT_NULL( - q_transpose_out, - platform::errors::NotFound("The input q_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - k_transpose_out, - platform::errors::NotFound("The input k_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - v_transpose_out, - platform::errors::NotFound("The input v_transpose_out can not be " - "nullptr when merge_qkv is false.")); - - phi::DenseTensor* query_out = config->GetQueryOut(); - phi::DenseTensor* key_out = config->GetKeyOut(); - phi::DenseTensor* value_out = config->GetValueOut(); - ComputeQKVTransposeForward(*query_out, - *key_out, - *value_out, - q_transpose_out, - k_transpose_out, - v_transpose_out); - - // q_size != k_size - q_ptr = q_transpose_out->data(); - k_ptr = k_transpose_out->data(); - v_ptr = v_transpose_out->data(); - } - - // qk_out = BatchedGEMM(Q, K^T) - // [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] * - // [batch_size, seq_len_m, num_heads, m_size, head_dim] - // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] - phi::DenseTensor* qk_out = config->GetQKOut(softmax_out); - T* qk_out_ptr = qk_out->data(); - - int64_t gemm_batch_size = - config->batch_size * config->seq_len_m * config->num_heads; - int64_t gemm_m = config->seq_len_r; - int64_t gemm_n = config->m_size; - int64_t gemm_k = config->head_dim; - - T alpha = static_cast(1.0 / sqrt(config->head_dim)); - ComputeBatchedGEMM(q_ptr, - k_ptr, - qk_out_ptr, - false, - true, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size, - alpha); - // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) - ComputeBiasMaskSoftmaxForward( - nonbatched_bias, src_mask, qk_out, softmax_out); - config->ClearQKOut(); + // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. + phi::DenseTensor cu_seq_q, cu_seq_k; + int64_t temp_size = (q_batch_size + 1) * n; + int64_t seq_size = 0; + int64_t start = 0, end = temp_size, step = n; + phi::funcs::GetSize(start, end, step, &seq_size); + cu_seq_q.Resize({seq_size}); + cu_seq_k.Resize({seq_size}); + AllocWithDebugInfo(dev_ctx_, "flash_attn: cu_seq_q", &cu_seq_q); + AllocWithDebugInfo(dev_ctx_, "flash_attn: cu_seq_k", &cu_seq_k); + + int64_t block = std::min(seq_size, static_cast(256)); + int64_t grid = (range_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, ` cu_seq_q.data(), cu_seq_k.data()); + + // 1. Dealing with mask and bias for flash_attn. 
+ phi::DenseTensor tmp_mask, tmp_bias; + if (src_mask) { + int64_t mask_first_dim = 0; + auto& mask_dim = src_mask->dims(); + for (int i = 0; i < mask_dim.size() - 3; ++i) { + mask_first_dim *= mask_dim[i]; + } + tmp_mask.ShareDataWith(*src_mask); + tmp_mask.Resize( + {mask_first_dim, mask_dim[-3], mask_dim[-2], mask_dim[-1]}); + } + if (nonbatched_bias) { + int64_t bias_first_dim = 0; + auto& bias_dim = nonbatched_bias->dims(); + for (int i = 0; i < bias_dim.size() - 3; ++i) { + bias_first_dim *= bias_dim[i]; + } + tmp_bias.ShareDataWith(*nonbatched_bias); + tmp_bias.Resize( + {bias_first_dim, bias_dim[-3], bias_dim[-2], bias_dim[-1]}); + } - // qktv_out = BatchedGEMM(softmax_out, V) - // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * - // [batch_size, seq_len_m, num_heads, m_size, head_dim] - // -> [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] - phi::DenseTensor* qktv_out = config->GetQKTVOut(gate_out); - T* qktv_out_ptr = qktv_out->data(); + // For flash_attn + bool is_bf16_ = + qkv_transpose_out.dtype() == DataType::BFLOAT16 ? true : false; + int64_t batch_size_ = cu_seq_q.numel() - 1; + int64_t total_q_ = qkv_dims[1]; // q.dims()[0] + int64_t total_k_ = qkv_dims[1]; // q.dims()[0] + int64_t num_heads_ = qkv_dims[2]; // q.dims()[1] + int64_t head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_; + int max_seqlen_k_; + int num_splits = 0; // 0 for an internal heuristic, which is optimal + + phi::DenseTensor softmax_lse; + softmax_lse.Resize({batch_size_, num_heads_, max_seqlen_q_}); + AllocWithDebugInfo( + dev_ctx_, "flash_attn: softmax_lse", &softmax_lse); + + auto gen = ctx.GetGenerator(); + uint64_t inc = batch_size * num_heads * 32; + auto seed_offset_pair = gen->IncrementOffset(inc); + + uint64_t seed = seed_offset_pair.first; + uint64_t offset = seed_offset_pair.second; + + seed_offset->Resize({2}); + auto* seed_offset_data = ctx.template HostAlloc(seed_offset); + seed_offset_data[0] = static_cast(seed); + seed_offset_data[1] = static_cast(offset); + + uint64_t workspace_size; + bool succ = phi::dynload::flash_attn_fwd( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + nullptr, // for calculation workspace size + static_cast(cu_seq_q.data()), + static_cast(cu_seq_k.data()), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + /*p_dropout=*/0.f, + /*softmax_scale=*/static_cast(1.0 / sqrt(config->head_size_)); + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16_, + num_splits, + softmax_lse.data(); + softmax_out->data(); + nullptr, + &workspace_size, + dev_ctx_.stream(), + seed, + offset, + src_mask ? src_mask->data() : nullptr, + nonbatched_bias ? 
nonbatched_bias->data() : nullptr, + src_mask->dims().Get(), + nonbatched_bias->dims().Get()); + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + DenseTensor workspace; + if (workspace_size > 0) { + workspace = + Empty(ctx, {int64_t(workspace_size / sizeof(float))}); + } + bool succ = phi::dynload::flash_attn_fwd( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + static_cast(fmha_out), // for calculation workspace size + static_cast(cu_seq_q.data()), + static_cast(cu_seq_k.data()), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + /*p_dropout=*/0.f, + /*softmax_scale=*/static_cast(1.0 / sqrt(config->head_size_)); + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16_, + num_splits, + softmax_lse.data(); + softmax_out->data(); + workspace_size > 0 ? workspace.data() : nullptr, + &workspace_size, + dev_ctx_.stream(), + seed, + offset, + src_mask ? src_mask->data() : nullptr, + nonbatched_bias ? nonbatched_bias->data() : nullptr, + src_mask->dims().Get(), + nonbatched_bias->dims().Get()); + } else { + if (merge_qkv_) { + // qkv_transpose_out = transpose(qkv_out) + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + phi::DenseTensor* qkv_out = config->GetQKVOut(); + ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); + config->ClearQKVOut(); + + // q_size == k_size + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + phi::DenseTensor* query_out = config->GetQueryOut(); + phi::DenseTensor* key_out = config->GetKeyOut(); + phi::DenseTensor* value_out = config->GetValueOut(); + ComputeQKVTransposeForward(*query_out, + *key_out, + *value_out, + q_transpose_out, + k_transpose_out, + v_transpose_out); + + // q_size != k_size + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + } + // qk_out = BatchedGEMM(Q, K^T) + // [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] * + // [batch_size, seq_len_m, num_heads, m_size, head_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] + phi::DenseTensor* qk_out = config->GetQKOut(softmax_out); + T* qk_out_ptr = qk_out->data(); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + int64_t gemm_m = config->seq_len_r; + int64_t gemm_n = config->m_size; + int64_t gemm_k = config->head_dim; + // attn = torch.matmul(q, k.transpose(-1, -2)) + T alpha = static_cast(1.0 / sqrt(config->head_dim)); + ComputeBatchedGEMM(q_ptr, + k_ptr, + qk_out_ptr, + false, + true, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size, + alpha); + // attn = softmax_dropout(attn, 0, self.training, mask=mask, bias=bias) + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + ComputeBiasMaskSoftmaxForward( + nonbatched_bias, src_mask, qk_out, softmax_out); + config->ClearQKOut(); + + // qktv_out = 
BatchedGEMM(softmax_out, V) + // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * + // [batch_size, seq_len_m, num_heads, m_size, head_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] + phi::DenseTensor* qktv_out = config->GetQKTVOut(gate_out); + T* qktv_out_ptr = qktv_out->data(); - gemm_m = config->seq_len_r; - gemm_n = config->head_dim; - gemm_k = config->m_size; + gemm_m = config->seq_len_r; + gemm_n = config->head_dim; + gemm_k = config->m_size; - T* softmax_out_ptr = softmax_out->data(); - ComputeBatchedGEMM(softmax_out_ptr, - v_ptr, - qktv_out_ptr, - false, - false, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size); + // o = torch.matmul(attn, v) + T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, + v_ptr, + qktv_out_ptr, + false, + false, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size); + // fmha_out = transpose(qktv_out) + // o = o.transpose(-2, -3).contiguous() + ComputeQKTVTransposeForward(*qktv_out, fmha_out); + } - // fmha_out = transpose(qktv_out) - ComputeQKTVTransposeForward(*qktv_out, fmha_out); config->ClearQKTVOut(); if (config->has_gating) { gate_out->Resize(config->gate_out_dims); diff --git a/paddle/phi/backends/dynload/flashattn.h b/paddle/phi/backends/dynload/flashattn.h index ec443fd9f8e8e..87836a49a700d 100644 --- a/paddle/phi/backends/dynload/flashattn.h +++ b/paddle/phi/backends/dynload/flashattn.h @@ -46,7 +46,8 @@ extern void* flashattn_dso_handle; #define FLASHATTN_ROUTINE_EACH(__macro) \ __macro(flash_attn_fwd); \ __macro(flash_attn_bwd); \ - __macro(flash_attn_error); + __macro(flash_attn_error); \ + __macro(flash_attn_fwd_with_bias_and_mask); FLASHATTN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP); From 76b87c2855b97116cef4f2b4404a8ab89eec6df3 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Tue, 18 Apr 2023 18:43:05 +0800 Subject: [PATCH 002/405] fix some bugs --- cmake/external/flashattn.cmake | 6 +- .../operators/fused/fused_gate_attention.h | 291 ++++++++++++------ paddle/phi/backends/dynload/flashattn.h | 4 +- 3 files changed, 205 insertions(+), 96 deletions(-) diff --git a/cmake/external/flashattn.cmake b/cmake/external/flashattn.cmake index 53b11da6a6732..a37559c483421 100644 --- a/cmake/external/flashattn.cmake +++ b/cmake/external/flashattn.cmake @@ -20,10 +20,10 @@ set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn) set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn) set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn) set(FLASHATTN_REPOSITORY ${GIT_URL}/JamesLim-sy/flash-attention.git) -set(FLASHATTN_TAG 05ecd140356cffe46444cd364944f6ec5067e724) +set(FLASHATTN_TAG c7442255b553e8f8ddee21b48a4bca992678cb89) -#set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git) -#set(FLASHATTN_TAG f0edf243a813a65d05c75fcb331b2a95faf96bbc) +# set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git) +# set(FLASHATTN_TAG f0edf243a813a65d05c75fcb331b2a95faf96bbc) set(FLASHATTN_INCLUDE_DIR "${FLASHATTN_INSTALL_DIR}/include" diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 7f634e835cadf..fcbebdda20a52 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -21,6 +21,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/funcs/transpose_function.cu.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "paddle/phi/kernels/funcs/range_function.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#ifdef PADDLE_WITH_FLASHATTN +#include "paddle/phi/kernels/flash_attn_kernel.h" +#include "paddle/phi/backends/dynload/flashattn.h" +#endif namespace paddle { namespace operators { @@ -147,7 +154,11 @@ struct GateAttentionConfig { gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, head_dim}; } - bool UseFlashAttn(const bool merge_qkv, const int64_t head_dim) { + bool UseFlashAttn(const bool merge_qkv, const bool is_amp) { + if (!is_amp) { + return false; + } + if (merge_qkv) { switch (head_dim) { case 16: @@ -334,15 +345,53 @@ struct GateAttentionGradConfig : public GateAttentionConfig { phi::DenseTensor qk_out_grad; }; -template + +#define DEBUG_HERE printf("[%s, %d]: Run here!\n", __func__, __LINE__); +#define DEBUG_DATA_INT(name, x) do { \ + printf("[%s, %d]: %s = %d\n", __func__, __LINE__, name, static_cast(x)); \ +} whilie(0); + +#define DEBUG_DATA_FlOAT(name, x) do {\ + printf("[%s, %d]: %s = %f\n", __func__, __LINE__, std::string(name), static_cast(x)); \ +} whilie(0); + +#define DEBUG_DIMS(x) do { \ + printf("[%s, %d]: dims is : [", __func__, __LINE__); \ + for (int i = 0; i < x.size(); ++i) { \ + printf("%d, ", x[i]); \ + } \ + printf(" ]\n"); \ +} whilie(0); + + +template __global__ void FlashAttRange( - int start, int step, int size, OUT_TYPE* out1, OUT_TYPE* out2) { + int start, int step, int size, T* out1, T* out2) { CUDA_KERNEL_LOOP(index, size) { - out1[index] = static_cast(start + step * index); - out2[index] = static_cast(start + step * index); + out1[index] = static_cast(start + step * index); + out2[index] = static_cast(start + step * index); } } +static void GetFlashAttnDimsString(const std::string& prefix, + const phi::DDim dim_val) { + // if (VLOG_IS_ON(4)) { + std::ostringstream out_string; + out_string << "FlashAttn - " << prefix << ".dims() is [ "; + for (int i = 0; i < dim_val.size(); ++i) { + out_string << dim_val[i] << ", "; + } + out_string << "]\n"; + VLOG(4) << out_string.str(); + std::cout << out_string.str(); + // } +} + +#define DBG_WAIT do {\ + printf("[%s, %d] Run here.\n", __func__, __LINE__); \ + dev_ctx_.Wait(); \ +} while(0); + template class FMHAGateRef { public: @@ -362,30 +411,46 @@ class FMHAGateRef { T* q_ptr = nullptr; T* k_ptr = nullptr; T* v_ptr = nullptr; + bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; + + if (std::is_same::value) { + std::cout << "T is phi::dtype::float16. \n"; + } else if (std::is_same::value) { + std::cout << "T is phi::dtype::bfloat16. \n"; + } else if (std::is_same::value) { + std::cout << "T is float. 
\n"; + } - if (config->UseFlashAttn(merge_qkv_, config->head_dim)) { + if (config->UseFlashAttn(merge_qkv_, is_bf16)) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); phi::DenseTensor* qkv_out = config->GetQKVOut(); - ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); + ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); + + dev_ctx_.Wait(); + qkv_transpose_out->Resize({3, + static_cast(config->batch_size), + static_cast(config->seq_len_m), + static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); - // batch_dims = q.shape[:-3]; - // n, no_heads, c = q.shape[-3:] // 1. Dealing with qkv_out for flash_attn. - auto& qkv_dims = qkv_transpose_out.dims(); + auto& qkv_dims = qkv_transpose_out->dims(); auto rank = qkv_dims.size(); - auto n = qkv_dims[rank - 3]; - auto no_heads = qkv_dims[rank - 2]; - auto c = qkv_dims[rank - 1]; - int64_t q_batch_size = 1; - for (int i = 0; i < (rank - 3); ++i) { + int64_t rest_dim = qkv_dims[rank - 3]; + for (int i = 1; i < (rank - 3); ++i) { q_batch_size *= qkv_dims[i]; } - qkv_transpose_out.Resize({q_batch_size * n, no_heads, c}); + qkv_transpose_out->Resize({3, + q_batch_size * rest_dim, + config->num_heads, + config->head_dim}); + DBG_WAIT; // q_size == k_size int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); @@ -394,80 +459,106 @@ class FMHAGateRef { // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t temp_size = (q_batch_size + 1) * n; + int64_t end_size = (q_batch_size + 1); int64_t seq_size = 0; - int64_t start = 0, end = temp_size, step = n; + int64_t start = 0, end = end_size, step = rest_dim; phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({seq_size}); - cu_seq_k.Resize({seq_size}); + cu_seq_q.Resize({end_size}); + cu_seq_k.Resize({end_size}); AllocWithDebugInfo(dev_ctx_, "flash_attn: cu_seq_q", &cu_seq_q); AllocWithDebugInfo(dev_ctx_, "flash_attn: cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (range_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, ` cu_seq_q.data(), cu_seq_k.data()); - - // 1. Dealing with mask and bias for flash_attn. - phi::DenseTensor tmp_mask, tmp_bias; + int64_t grid = (seq_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, cu_seq_q.data(), cu_seq_k.data()); + VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start + << ", step = " << step << ", end = " << end; + DBG_WAIT; + + // 3. Dealing with mask and bias for flash_attn. 
+ phi::DenseTensor temp_mask, temp_bias; if (src_mask) { - int64_t mask_first_dim = 0; - auto& mask_dim = src_mask->dims(); + int64_t mask_first_dim = 1; + temp_mask.ShareDataWith(*src_mask); + + auto mask_dim = temp_mask.dims(); for (int i = 0; i < mask_dim.size() - 3; ++i) { mask_first_dim *= mask_dim[i]; } - tmp_mask.ShareDataWith(*src_mask); - tmp_mask.Resize( - {mask_first_dim, mask_dim[-3], mask_dim[-2], mask_dim[-1]}); + auto mask_dim_rank = mask_dim.size(); + temp_mask.Resize({mask_first_dim, + mask_dim[mask_dim_rank - 3], + mask_dim[mask_dim_rank - 2], + mask_dim[mask_dim_rank - 1]}); + GetFlashAttnDimsString("mask_dim", temp_mask.dims()); } if (nonbatched_bias) { - int64_t bias_first_dim = 0; - auto& bias_dim = nonbatched_bias->dims(); + int64_t bias_first_dim = 1; + temp_bias.ShareDataWith(*nonbatched_bias); + + auto bias_dim = nonbatched_bias->dims(); for (int i = 0; i < bias_dim.size() - 3; ++i) { bias_first_dim *= bias_dim[i]; } - tmp_bias.ShareDataWith(*nonbatched_bias); - tmp_bias.Resize( - {bias_first_dim, bias_dim[-3], bias_dim[-2], bias_dim[-1]}); + auto bias_dim_rank = temp_bias.dims().size(); + temp_bias.Resize({bias_first_dim, + bias_dim[bias_dim_rank - 3], + bias_dim[bias_dim_rank - 2], + bias_dim[bias_dim_rank - 1]}); + GetFlashAttnDimsString("bias_dim", temp_bias.dims()); } - - // For flash_attn - bool is_bf16_ = - qkv_transpose_out.dtype() == DataType::BFLOAT16 ? true : false; - int64_t batch_size_ = cu_seq_q.numel() - 1; - int64_t total_q_ = qkv_dims[1]; // q.dims()[0] - int64_t total_k_ = qkv_dims[1]; // q.dims()[0] - int64_t num_heads_ = qkv_dims[2]; // q.dims()[1] - int64_t head_size_ = qkv_dims[3]; // q.dims()[2] - int max_seqlen_q_; - int max_seqlen_k_; + DBG_WAIT; + GetFlashAttnDimsString("qkv_transpose_out", qkv_transpose_out->dims()); + + // 4. flash_attn parameter setting. + int batch_size_ = q_batch_size; + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_ = batch_size_; + int max_seqlen_k_ = batch_size_; int num_splits = 0; // 0 for an internal heuristic, which is optimal - + DBG_WAIT; + + VLOG(6) << "[Flash_attn] batch_size : " << batch_size_; + VLOG(6) << "[Flash_attn] total_q : " << total_q_; + VLOG(6) << "[Flash_attn] total_k : " << total_k_; + VLOG(6) << "[Flash_attn] num_heads : " << num_heads_; + VLOG(6) << "[Flash_attn] head_size : " << head_size_; + VLOG(6) << "[Flash_attn] max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "[Flash_attn] max_seqlen_k : " << max_seqlen_k_; + + // 5. construct softmax_lse phi::DenseTensor softmax_lse; - softmax_lse.Resize({batch_size_, num_heads_, max_seqlen_q_}); - AllocWithDebugInfo( - dev_ctx_, "flash_attn: softmax_lse", &softmax_lse); - - auto gen = ctx.GetGenerator(); - uint64_t inc = batch_size * num_heads * 32; + int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; + softmax_lse.Resize({batch_size_, num_heads_, softmax_lse_last_dim}); + AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", &softmax_lse); + + DBG_WAIT; + // 6. 
construct random seed + auto gen = dev_ctx_.GetGenerator(); + uint64_t inc = batch_size_ * num_heads_ * 32; auto seed_offset_pair = gen->IncrementOffset(inc); - uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; - seed_offset->Resize({2}); - auto* seed_offset_data = ctx.template HostAlloc(seed_offset); - seed_offset_data[0] = static_cast(seed); - seed_offset_data[1] = static_cast(offset); + GetFlashAttnDimsString("softmax_out", softmax_out->dims()); + GetFlashAttnDimsString("softmax_lse", softmax_lse.dims()); + DBG_WAIT; + // 7. flas_attn part one, get temp worksapce size. + float p_dropout = 0.f; + float softmax_scale = static_cast(1); + cudaStream_t stream = dev_ctx_.stream(); uint64_t workspace_size; - bool succ = phi::dynload::flash_attn_fwd( + bool succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), nullptr, // for calculation workspace size - static_cast(cu_seq_q.data()), - static_cast(cu_seq_k.data()), + static_cast(cu_seq_q.data()), + static_cast(cu_seq_k.data()), total_q_, total_k_, batch_size_, @@ -475,38 +566,41 @@ class FMHAGateRef { head_size_, max_seqlen_q_, max_seqlen_k_, - /*p_dropout=*/0.f, - /*softmax_scale=*/static_cast(1.0 / sqrt(config->head_size_)); + p_dropout, + softmax_scale, /*zero_tensors=*/false, /*is_causal=*/false, - is_bf16_, + is_bf16, num_splits, - softmax_lse.data(); - softmax_out->data(); + softmax_lse.data(), + softmax_out->data(), nullptr, &workspace_size, - dev_ctx_.stream(), + stream, seed, offset, - src_mask ? src_mask->data() : nullptr, - nonbatched_bias ? nonbatched_bias->data() : nullptr, - src_mask->dims().Get(), - nonbatched_bias->dims().Get()); + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? temp_bias.data() : nullptr, + temp_mask.dims().Get(), + temp_bias.dims().Get() + ); if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - DenseTensor workspace; + DBG_WAIT; + phi::DenseTensor workspace; if (workspace_size > 0) { - workspace = - Empty(ctx, {int64_t(workspace_size / sizeof(float))}); + workspace = phi::Empty(dev_ctx_, {int64_t(workspace_size / sizeof(float))}); } - bool succ = phi::dynload::flash_attn_fwd( + DBG_WAIT; + // 8. flas_attn part two, run impl. + succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), static_cast(fmha_out), // for calculation workspace size - static_cast(cu_seq_q.data()), - static_cast(cu_seq_k.data()), + static_cast(cu_seq_q.data()), + static_cast(cu_seq_k.data()), total_q_, total_k_, batch_size_, @@ -514,23 +608,29 @@ class FMHAGateRef { head_size_, max_seqlen_q_, max_seqlen_k_, - /*p_dropout=*/0.f, - /*softmax_scale=*/static_cast(1.0 / sqrt(config->head_size_)); + p_dropout, + softmax_scale, /*zero_tensors=*/false, /*is_causal=*/false, - is_bf16_, + is_bf16, num_splits, - softmax_lse.data(); - softmax_out->data(); - workspace_size > 0 ? workspace.data() : nullptr, + softmax_lse.data(), + softmax_out->data(), + workspace_size > 0 ? static_cast(workspace.data()) : nullptr, &workspace_size, - dev_ctx_.stream(), + stream, seed, offset, - src_mask ? src_mask->data() : nullptr, - nonbatched_bias ? nonbatched_bias->data() : nullptr, - src_mask->dims().Get(), - nonbatched_bias->dims().Get()); + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? 
temp_bias.data() : nullptr, + temp_mask.dims().Get(), + temp_bias.dims().Get() + ); + DBG_WAIT; + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + DBG_WAIT; } else { if (merge_qkv_) { // qkv_transpose_out = transpose(qkv_out) @@ -609,7 +709,7 @@ class FMHAGateRef { // qktv_out = BatchedGEMM(softmax_out, V) // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * - // [batch_size, seq_len_m, num_heads, m_size, head_dim] + // [batch_size, seq_len_m, num_heads, m_size, head_dim] // -> [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] phi::DenseTensor* qktv_out = config->GetQKTVOut(gate_out); T* qktv_out_ptr = qktv_out->data(); @@ -854,6 +954,15 @@ class FMHAGateRef { dev_ctx_, qkv_out, perm, qkv_transpose_out); } + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> + // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] + void ComputeQKVTransposeForwardForFlashAttn(const phi::DenseTensor& qkv_out, + phi::DenseTensor* qkv_transpose_out) { + std::vector perm = {3, 0, 1, 2, 4, 5}; + phi::funcs::TransposeGPUKernelDriver( + dev_ctx_, qkv_out, perm, qkv_transpose_out); + } + void ComputeQKVTransposeBackward( const phi::DenseTensor& qkv_transpose_out_grad, phi::DenseTensor* qkv_out_grad) { diff --git a/paddle/phi/backends/dynload/flashattn.h b/paddle/phi/backends/dynload/flashattn.h index 87836a49a700d..1d235fcdecfc5 100644 --- a/paddle/phi/backends/dynload/flashattn.h +++ b/paddle/phi/backends/dynload/flashattn.h @@ -46,8 +46,8 @@ extern void* flashattn_dso_handle; #define FLASHATTN_ROUTINE_EACH(__macro) \ __macro(flash_attn_fwd); \ __macro(flash_attn_bwd); \ - __macro(flash_attn_error); \ - __macro(flash_attn_fwd_with_bias_and_mask); + __macro(flash_attn_error); + __macro(flash_attn_fwd_with_bias_and_mask); \ FLASHATTN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP); From a26df023f28d39d0288736b17dc2861988da9e44 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Tue, 18 Apr 2023 18:44:00 +0800 Subject: [PATCH 003/405] fix some bugs --- .../operators/fused/fused_gate_attention.h | 118 ++++++++++-------- 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index fcbebdda20a52..b58e9dbc59215 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -15,18 +15,18 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/kernels/arange_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/range_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/funcs/transpose_function.cu.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" -#include "paddle/phi/kernels/funcs/range_function.h" -#include "paddle/phi/kernels/empty_kernel.h" #ifdef PADDLE_WITH_FLASHATTN -#include "paddle/phi/kernels/flash_attn_kernel.h" #include "paddle/phi/backends/dynload/flashattn.h" +#include "paddle/phi/kernels/flash_attn_kernel.h" #endif namespace paddle { @@ -155,7 +155,7 @@ struct GateAttentionConfig { } bool UseFlashAttn(const bool merge_qkv, const bool is_amp) { - if (!is_amp) { + if (!is_amp) { return false; } @@ -345,28 +345,36 @@ struct GateAttentionGradConfig : public GateAttentionConfig { phi::DenseTensor qk_out_grad; }; - #define DEBUG_HERE printf("[%s, %d]: Run here!\n", __func__, __LINE__); -#define DEBUG_DATA_INT(name, x) do { \ - printf("[%s, %d]: %s = %d\n", __func__, __LINE__, name, static_cast(x)); \ -} whilie(0); - -#define DEBUG_DATA_FlOAT(name, x) do {\ - printf("[%s, %d]: %s = %f\n", __func__, __LINE__, std::string(name), static_cast(x)); \ -} whilie(0); - -#define DEBUG_DIMS(x) do { \ - printf("[%s, %d]: dims is : [", __func__, __LINE__); \ - for (int i = 0; i < x.size(); ++i) { \ - printf("%d, ", x[i]); \ - } \ - printf(" ]\n"); \ -} whilie(0); - +#define DEBUG_DATA_INT(name, x) \ + do { \ + printf( \ + "[%s, %d]: %s = %d\n", __func__, __LINE__, name, static_cast(x)); \ + } \ + whilie(0); + +#define DEBUG_DATA_FlOAT(name, x) \ + do { \ + printf("[%s, %d]: %s = %f\n", \ + __func__, \ + __LINE__, \ + std::string(name), \ + static_cast(x)); \ + } \ + whilie(0); + +#define DEBUG_DIMS(x) \ + do { \ + printf("[%s, %d]: dims is : [", __func__, __LINE__); \ + for (int i = 0; i < x.size(); ++i) { \ + printf("%d, ", x[i]); \ + } \ + printf(" ]\n"); \ + } \ + whilie(0); template -__global__ void FlashAttRange( - int start, int step, int size, T* out1, T* out2) { +__global__ void FlashAttRange(int start, int step, int size, T* out1, T* out2) { CUDA_KERNEL_LOOP(index, size) { out1[index] = static_cast(start + step * index); out2[index] = static_cast(start + step * index); @@ -374,23 +382,24 @@ __global__ void FlashAttRange( } static void GetFlashAttnDimsString(const std::string& prefix, - const phi::DDim dim_val) { - // if (VLOG_IS_ON(4)) { + const phi::DDim dim_val) { + if (VLOG_IS_ON(4)) { std::ostringstream out_string; out_string << "FlashAttn - " << prefix << ".dims() is [ "; for (int i = 0; i < dim_val.size(); ++i) { - out_string << dim_val[i] << ", "; + out_string << dim_val[i] << ", "; } out_string << "]\n"; VLOG(4) << out_string.str(); std::cout << out_string.str(); - // } + } } -#define DBG_WAIT do {\ - printf("[%s, %d] Run here.\n", __func__, __LINE__); \ - dev_ctx_.Wait(); \ -} while(0); +#define DBG_WAIT \ + do { \ + printf("[%s, %d] Run here.\n", __func__, __LINE__); \ + dev_ctx_.Wait(); \ + } while (0); template class FMHAGateRef { @@ -411,7 +420,8 @@ class FMHAGateRef { T* q_ptr = nullptr; T* k_ptr = nullptr; T* v_ptr = nullptr; - bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; + bool is_bf16 = + qkv_transpose_out->dtype() == DataType::BFLOAT16 ? 
true : false; if (std::is_same::value) { std::cout << "T is phi::dtype::float16. \n"; @@ -429,14 +439,14 @@ class FMHAGateRef { phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); - + dev_ctx_.Wait(); qkv_transpose_out->Resize({3, - static_cast(config->batch_size), - static_cast(config->seq_len_m), - static_cast(config->seq_len_r), - static_cast(config->num_heads), - static_cast(config->head_dim)}); + static_cast(config->batch_size), + static_cast(config->seq_len_m), + static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); // 1. Dealing with qkv_out for flash_attn. auto& qkv_dims = qkv_transpose_out->dims(); @@ -446,10 +456,8 @@ class FMHAGateRef { for (int i = 1; i < (rank - 3); ++i) { q_batch_size *= qkv_dims[i]; } - qkv_transpose_out->Resize({3, - q_batch_size * rest_dim, - config->num_heads, - config->head_dim}); + qkv_transpose_out->Resize( + {3, q_batch_size * rest_dim, config->num_heads, config->head_dim}); DBG_WAIT; // q_size == k_size int64_t q_size = config->GetQuerySize(); @@ -486,10 +494,10 @@ class FMHAGateRef { mask_first_dim *= mask_dim[i]; } auto mask_dim_rank = mask_dim.size(); - temp_mask.Resize({mask_first_dim, - mask_dim[mask_dim_rank - 3], - mask_dim[mask_dim_rank - 2], - mask_dim[mask_dim_rank - 1]}); + temp_mask.Resize({mask_first_dim, + mask_dim[mask_dim_rank - 3], + mask_dim[mask_dim_rank - 2], + mask_dim[mask_dim_rank - 1]}); GetFlashAttnDimsString("mask_dim", temp_mask.dims()); } if (nonbatched_bias) { @@ -520,7 +528,7 @@ class FMHAGateRef { int max_seqlen_k_ = batch_size_; int num_splits = 0; // 0 for an internal heuristic, which is optimal DBG_WAIT; - + VLOG(6) << "[Flash_attn] batch_size : " << batch_size_; VLOG(6) << "[Flash_attn] total_q : " << total_q_; VLOG(6) << "[Flash_attn] total_k : " << total_k_; @@ -533,7 +541,8 @@ class FMHAGateRef { phi::DenseTensor softmax_lse; int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; softmax_lse.Resize({batch_size_, num_heads_, softmax_lse_last_dim}); - AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", &softmax_lse); + AllocWithDebugInfo( + dev_ctx_, "flash_attn: softmax_lse", &softmax_lse); DBG_WAIT; // 6. construct random seed @@ -582,15 +591,15 @@ class FMHAGateRef { src_mask ? temp_mask.data() : nullptr, nonbatched_bias ? temp_bias.data() : nullptr, temp_mask.dims().Get(), - temp_bias.dims().Get() - ); + temp_bias.dims().Get()); if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } DBG_WAIT; phi::DenseTensor workspace; if (workspace_size > 0) { - workspace = phi::Empty(dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + workspace = phi::Empty( + dev_ctx_, {int64_t(workspace_size / sizeof(float))}); } DBG_WAIT; // 8. flas_attn part two, run impl. @@ -624,8 +633,7 @@ class FMHAGateRef { src_mask ? temp_mask.data() : nullptr, nonbatched_bias ? 
temp_bias.data() : nullptr, temp_mask.dims().Get(), - temp_bias.dims().Get() - ); + temp_bias.dims().Get()); DBG_WAIT; if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); @@ -956,8 +964,8 @@ class FMHAGateRef { // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] - void ComputeQKVTransposeForwardForFlashAttn(const phi::DenseTensor& qkv_out, - phi::DenseTensor* qkv_transpose_out) { + void ComputeQKVTransposeForwardForFlashAttn( + const phi::DenseTensor& qkv_out, phi::DenseTensor* qkv_transpose_out) { std::vector perm = {3, 0, 1, 2, 4, 5}; phi::funcs::TransposeGPUKernelDriver( dev_ctx_, qkv_out, perm, qkv_transpose_out); From cc0889933c0f34c5e6c0c75321cec7bb11043857 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Wed, 19 Apr 2023 19:15:50 +0800 Subject: [PATCH 004/405] fix bugs in flashattn.h --- .../operators/fused/fused_gate_attention.h | 207 ++++++++++++------ .../fused/fused_gate_attention_op.cc | 9 + .../fused/fused_gate_attention_op.cu | 11 +- paddle/phi/backends/dynload/flashattn.h | 8 +- .../phi/kernels/funcs/blas/blaslt_impl.cu.h | 46 +++- 5 files changed, 196 insertions(+), 85 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index b58e9dbc59215..58ab2d9e378a2 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/phi/kernels/arange_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" @@ -69,6 +70,7 @@ struct GateAttentionConfig { bool merge_qkv; bool has_gating; + bool use_flash_attn; int64_t batch_size; int64_t seq_len_m; @@ -98,8 +100,12 @@ struct GateAttentionConfig { const phi::DenseTensor* query_weight, const phi::DenseTensor* qkv_weight, bool merge_qkv, - bool has_gating) - : dev_ctx(dev_ctx), merge_qkv(merge_qkv), has_gating(has_gating) { + bool has_gating, + bool use_flash_attn = false) + : dev_ctx(dev_ctx), + merge_qkv(merge_qkv), + has_gating(has_gating), + use_flash_attn(use_flash_attn) { // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] batch_size = query->dims()[0]; seq_len_m = query->dims()[1]; @@ -383,16 +389,16 @@ __global__ void FlashAttRange(int start, int step, int size, T* out1, T* out2) { static void GetFlashAttnDimsString(const std::string& prefix, const phi::DDim dim_val) { - if (VLOG_IS_ON(4)) { - std::ostringstream out_string; - out_string << "FlashAttn - " << prefix << ".dims() is [ "; - for (int i = 0; i < dim_val.size(); ++i) { - out_string << dim_val[i] << ", "; - } - out_string << "]\n"; - VLOG(4) << out_string.str(); - std::cout << out_string.str(); + // if (VLOG_IS_ON(4)) { + std::ostringstream out_string; + out_string << "FlashAttn - " << prefix << ".dims() is [ "; + for (int i = 0; i < dim_val.size(); ++i) { + out_string << dim_val[i] << ", "; } + out_string << "]\n"; + VLOG(4) << out_string.str(); + std::cout << out_string.str(); + // } } #define DBG_WAIT \ @@ -431,34 +437,26 @@ class FMHAGateRef { std::cout << "T is float. 
\n"; } - if (config->UseFlashAttn(merge_qkv_, is_bf16)) { + if (config->UseFlashAttn(merge_qkv_, config->use_flash_attn && is_bf16)) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); + + // 1. Dealing with qkv_out for flash_attn. phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); - dev_ctx_.Wait(); - qkv_transpose_out->Resize({3, - static_cast(config->batch_size), - static_cast(config->seq_len_m), - static_cast(config->seq_len_r), - static_cast(config->num_heads), - static_cast(config->head_dim)}); - - // 1. Dealing with qkv_out for flash_attn. - auto& qkv_dims = qkv_transpose_out->dims(); - auto rank = qkv_dims.size(); - int64_t q_batch_size = 1; - int64_t rest_dim = qkv_dims[rank - 3]; - for (int i = 1; i < (rank - 3); ++i) { - q_batch_size *= qkv_dims[i]; - } + int seq_batch_size = static_cast(config->batch_size) * + static_cast(config->seq_len_m); qkv_transpose_out->Resize( - {3, q_batch_size * rest_dim, config->num_heads, config->head_dim}); + {3, + seq_batch_size * static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); DBG_WAIT; + // q_size == k_size int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); @@ -467,9 +465,10 @@ class FMHAGateRef { // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t end_size = (q_batch_size + 1); + int64_t end_size = (seq_batch_size + 1); int64_t seq_size = 0; - int64_t start = 0, end = end_size, step = rest_dim; + int64_t start = 0, end = end_size, + step = static_cast(config->seq_len_r); phi::funcs::GetSize(start, end, step, &seq_size); cu_seq_q.Resize({end_size}); cu_seq_k.Resize({end_size}); @@ -485,41 +484,32 @@ class FMHAGateRef { // 3. Dealing with mask and bias for flash_attn. 
phi::DenseTensor temp_mask, temp_bias; - if (src_mask) { - int64_t mask_first_dim = 1; - temp_mask.ShareDataWith(*src_mask); - - auto mask_dim = temp_mask.dims(); - for (int i = 0; i < mask_dim.size() - 3; ++i) { - mask_first_dim *= mask_dim[i]; - } - auto mask_dim_rank = mask_dim.size(); - temp_mask.Resize({mask_first_dim, - mask_dim[mask_dim_rank - 3], - mask_dim[mask_dim_rank - 2], - mask_dim[mask_dim_rank - 1]}); - GetFlashAttnDimsString("mask_dim", temp_mask.dims()); - } - if (nonbatched_bias) { - int64_t bias_first_dim = 1; - temp_bias.ShareDataWith(*nonbatched_bias); - - auto bias_dim = nonbatched_bias->dims(); - for (int i = 0; i < bias_dim.size() - 3; ++i) { - bias_first_dim *= bias_dim[i]; + auto dims_merge_func = [&](const phi::DenseTensor* src_tensor, + phi::DenseTensor* dst_tensor, + const std::string& prefix) { + if (src_tensor) { + int64_t first_dim = 1; + dst_tensor->ShareDataWith(*src_tensor); + auto dims_ = src_tensor->dims(); + for (int i = 0; i < dims_.size() - 3; ++i) { + first_dim *= dims_[i]; + } + auto dims_rank = dims_.size(); + dst_tensor->Resize({first_dim, + dims_[dims_rank - 3], + dims_[dims_rank - 2], + dims_[dims_rank - 1]}); + GetFlashAttnDimsString(prefix, temp_mask.dims()); } - auto bias_dim_rank = temp_bias.dims().size(); - temp_bias.Resize({bias_first_dim, - bias_dim[bias_dim_rank - 3], - bias_dim[bias_dim_rank - 2], - bias_dim[bias_dim_rank - 1]}); - GetFlashAttnDimsString("bias_dim", temp_bias.dims()); - } + }; + auto& qkv_dims = qkv_transpose_out->dims(); + dims_merge_func(src_mask, &temp_mask, "mask_dim"); + dims_merge_func(nonbatched_bias, &temp_bias, "bias_dim"); + GetFlashAttnDimsString("qkv_transpose_out", qkv_dims); DBG_WAIT; - GetFlashAttnDimsString("qkv_transpose_out", qkv_transpose_out->dims()); - // 4. flash_attn parameter setting. - int batch_size_ = q_batch_size; + + int batch_size_ = seq_batch_size; int total_q_ = qkv_dims[1]; // q.dims()[0] int total_k_ = qkv_dims[1]; // q.dims()[0] int num_heads_ = qkv_dims[2]; // q.dims()[1] @@ -527,8 +517,6 @@ class FMHAGateRef { int max_seqlen_q_ = batch_size_; int max_seqlen_k_ = batch_size_; int num_splits = 0; // 0 for an internal heuristic, which is optimal - DBG_WAIT; - VLOG(6) << "[Flash_attn] batch_size : " << batch_size_; VLOG(6) << "[Flash_attn] total_q : " << total_q_; VLOG(6) << "[Flash_attn] total_k : " << total_k_; @@ -554,6 +542,9 @@ class FMHAGateRef { GetFlashAttnDimsString("softmax_out", softmax_out->dims()); GetFlashAttnDimsString("softmax_lse", softmax_lse.dims()); + + GetFlashAttnDimsString("cu_seq_q", cu_seq_q.dims()); + GetFlashAttnDimsString("cu_seq_k", cu_seq_k.dims()); DBG_WAIT; // 7. flas_attn part one, get temp worksapce size. @@ -699,6 +690,27 @@ class FMHAGateRef { int64_t gemm_k = config->head_dim; // attn = torch.matmul(q, k.transpose(-1, -2)) T alpha = static_cast(1.0 / sqrt(config->head_dim)); + // ComputeBatchedGEMM(merge_qkv_ ? + // phi::slice_ddim(qkv_transpose_out->dims(), + // 1, + // qkv_transpose_out->dims().size() + // - 1) : q_transpose_out->dims(), + // merge_qkv_ ? + // phi::slice_ddim(qkv_transpose_out->dims(), + // 1, + // qkv_transpose_out->dims().size() + // - 1) : k_transpose_out->dims(), + // q_ptr, + // k_ptr, + // qk_out_ptr, + // false, + // true, + // gemm_m, + // gemm_n, + // gemm_k, + // gemm_batch_size, + // alpha); + ComputeBatchedGEMM(q_ptr, k_ptr, qk_out_ptr, @@ -728,6 +740,22 @@ class FMHAGateRef { // o = torch.matmul(attn, v) T* softmax_out_ptr = softmax_out->data(); + // ComputeBatchedGEMM(softmax_out->dims(), + // merge_qkv_ ? 
+ // phi::slice_ddim(qkv_transpose_out->dims(), + // 1, + // qkv_transpose_out->dims().size() + // - 1) : v_transpose_out->dims(), + // softmax_out_ptr, + // v_ptr, + // qktv_out_ptr, + // false, + // false, + // gemm_m, + // gemm_n, + // gemm_k, + // gemm_batch_size); + ComputeBatchedGEMM(softmax_out_ptr, v_ptr, qktv_out_ptr, @@ -737,6 +765,7 @@ class FMHAGateRef { gemm_n, gemm_k, gemm_batch_size); + // fmha_out = transpose(qktv_out) // o = o.transpose(-2, -3).contiguous() ComputeQKTVTransposeForward(*qktv_out, fmha_out); @@ -1056,6 +1085,50 @@ class FMHAGateRef { } private: + void ComputeBatchedGEMM(const phi::DDim& a_dims, + const phi::DDim& b_dims, + const T* a_ptr, + const T* b_ptr, + T* c_ptr, + bool trans_a, + bool trans_b, + int64_t m, + int64_t n, + int64_t k, + int64_t batch_size, + T alpha = static_cast(1.0), + T beta = static_cast(0.0)) { + int64_t stride_a = m * k; + int64_t stride_b = k * n; + int64_t stride_out = m * n; + + phi::funcs::MatmulPlanner matmul_planner( + vectorize(a_dims), + vectorize(b_dims), + trans_a, + trans_b, + phi::CppTypeToDataType::Type(), + phi::funcs::MatmulFusedType::kMatmul); + + using blaslt = phi::funcs::MatmulWithCublasLt; + blaslt::RunWithBatch(dev_ctx_, + a_ptr, + b_ptr, + c_ptr, + m, + n, + k, + trans_a, + trans_b, + batch_size, + stride_a, + stride_b, + stride_out, + &matmul_planner, + alpha, + beta); + } + void ComputeBatchedGEMM(const T* a_ptr, const T* b_ptr, T* c_ptr, @@ -1067,11 +1140,11 @@ class FMHAGateRef { int64_t batch_size, T alpha = static_cast(1.0), T beta = static_cast(0.0)) { - CBLAS_TRANSPOSE cblas_trans_a = trans_a ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE cblas_trans_b = trans_b ? CblasTrans : CblasNoTrans; int64_t stride_a = m * k; int64_t stride_b = k * n; + CBLAS_TRANSPOSE cblas_trans_a = trans_a ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE cblas_trans_b = trans_b ? 
CblasTrans : CblasNoTrans; auto blas = phi::funcs::GetBlas(dev_ctx_); blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index c91bca47cf42f..e2d8926f53516 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -162,6 +162,9 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate() .AsDispensable(); AddOutput("Out", "Result after attention."); + AddOutput("SoftmaxLse", "Result of the gating module.") + .AsIntermediate() + .AsDispensable(); AddAttr("has_gating", "if true, the attention op uses gate architecure, " "[default true].") @@ -170,6 +173,11 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "if true, calculation with merged qkv, " "[default true].") .SetDefault(true); + AddAttr( + "use_flash_attn", + "if true, the attention op will be computed in flash_attn branch, " + "[default false].") + .SetDefault(false); AddComment(R"DOC( Add fused attention op whose logic is as follows: { @@ -276,6 +284,7 @@ class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); bool merge_qkv = PADDLE_GET_CONST(bool, op->GetAttr("merge_qkv")); + if (merge_qkv) { op->SetInput("QKVWeight", this->Input("QKVWeight")); op->SetOutput(framework::GradVarName("QKVWeight"), diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index ca7b70b220f38..5927f03eca09d 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -378,6 +378,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { const bool merge_qkv = ctx.Attr("merge_qkv"); const bool has_gating = ctx.Attr("has_gating"); + const bool use_flash_attn = ctx.Attr("use_flash_attn"); bool use_fused_matmul_bias = true; auto &dev_ctx = ctx.template device_context(); @@ -389,8 +390,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { AllocWithDebugInfo(dev_ctx, "out", out); // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged. 
- GateAttentionConfig config( - dev_ctx, query, key, query_weight, qkv_weight, merge_qkv, has_gating); + GateAttentionConfig config(dev_ctx, + query, + key, + query_weight, + qkv_weight, + merge_qkv, + has_gating, + use_flash_attn); if (merge_qkv) { PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/backends/dynload/flashattn.h b/paddle/phi/backends/dynload/flashattn.h index 1d235fcdecfc5..36ec693218f3c 100644 --- a/paddle/phi/backends/dynload/flashattn.h +++ b/paddle/phi/backends/dynload/flashattn.h @@ -43,11 +43,11 @@ extern void* flashattn_dso_handle; #define DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP(__name) \ DYNAMIC_LOAD_FLASHATTN_WRAP(__name) -#define FLASHATTN_ROUTINE_EACH(__macro) \ - __macro(flash_attn_fwd); \ - __macro(flash_attn_bwd); \ - __macro(flash_attn_error); +#define FLASHATTN_ROUTINE_EACH(__macro) \ + __macro(flash_attn_fwd); \ + __macro(flash_attn_bwd); \ __macro(flash_attn_fwd_with_bias_and_mask); \ + __macro(flash_attn_error); FLASHATTN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP); diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h index ab3b5af1d54ac..79a653fb48772 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -328,10 +328,20 @@ struct MatmulWithCublasLt { const int K, const bool trans_x, const bool trans_y, - phi::funcs::MatmulPlanner* planner = nullptr) { + phi::funcs::MatmulPlanner* planner = nullptr, + T alpha = static_cast(1), + T beta = static_cast(0)) { auto setter = DescriptorSetter(planner, M, N, K, trans_x, trans_y); - RunImpl( - ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); + + RunImpl(ctx, + &setter.desc, + setter.sub_key, + x_data, + y_data, + out_data, + planner, + static_cast(alpha), + static_cast(beta)); } static void RunWithBatch(const phi::GPUContext& ctx, @@ -347,7 +357,9 @@ struct MatmulWithCublasLt { int64_t stride_x, int64_t stride_y, int64_t stride_out, - phi::funcs::MatmulPlanner* planner = nullptr) { + phi::funcs::MatmulPlanner* planner = nullptr, + T alpha = static_cast(1), + T beta = static_cast(0)) { auto setter = DescriptorSetter(planner, M, N, @@ -358,8 +370,15 @@ struct MatmulWithCublasLt { stride_x, stride_y, stride_out); - RunImpl( - ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); + RunImpl(ctx, + &setter.desc, + setter.sub_key, + x_data, + y_data, + out_data, + planner, + static_cast(alpha), + static_cast(beta)); } static void RunWithBatch(const phi::GPUContext& ctx, @@ -372,7 +391,9 @@ struct MatmulWithCublasLt { bool trans_x, bool trans_y, int batch_size, - phi::funcs::MatmulPlanner* planner = nullptr) { + phi::funcs::MatmulPlanner* planner = nullptr, + T alpha = static_cast(1), + T beta = static_cast(0)) { for (int i = 0; i < batch_size; ++i) { Run(ctx, x_data[i], @@ -383,7 +404,9 @@ struct MatmulWithCublasLt { K, trans_x, trans_y, - planner); + planner, + alpha, + beta); } } @@ -402,10 +425,9 @@ struct MatmulWithCublasLt { const T* x_ptr, const T* y_ptr, T* out_ptr, - phi::funcs::MatmulPlanner* planner) { - MT alpha = static_cast(1); - MT beta = static_cast(0); - + phi::funcs::MatmulPlanner* planner, + MT alpha, + MT beta) { cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); size_t workspace_size = static_cast(4) * 1024 * 1024; phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); From 5bee3a69932444ea079e5ca57c2a5564156e1bd3 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Wed, 19 Apr 2023 20:23:06 +0800 Subject: [PATCH 005/405] :qix 
pointer bugs from yesterday's errors

---
 cmake/external/flashattn.cmake              |   2 +-
 .../operators/fused/fused_gate_attention.h  | 474 +++++++++++++-----
 .../fused/fused_gate_attention_op.cc        |   7 +
 .../fused/fused_gate_attention_op.cu        |
seed_offset_pair.second; + uint64_t offset = 1908576; // seed_offset_pair.second; GetFlashAttnDimsString("softmax_out", softmax_out->dims()); GetFlashAttnDimsString("softmax_lse", softmax_lse.dims()); - GetFlashAttnDimsString("cu_seq_q", cu_seq_q.dims()); GetFlashAttnDimsString("cu_seq_k", cu_seq_k.dims()); DBG_WAIT; @@ -587,18 +596,23 @@ class FMHAGateRef { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } DBG_WAIT; + phi::DenseTensor workspace; + printf("workspace_size = %d\n", workspace_size); if (workspace_size > 0) { - workspace = phi::Empty( + workspace = phi::Empty( dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + DBGPTR(workspace.data(), "workspace"); } DBG_WAIT; + // 8. flas_attn part two, run impl. succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), - static_cast(fmha_out), // for calculation workspace size + static_cast( + fmha_out->data()), // for calculation workspace size static_cast(cu_seq_q.data()), static_cast(cu_seq_k.data()), total_q_, @@ -616,7 +630,7 @@ class FMHAGateRef { num_splits, softmax_lse.data(), softmax_out->data(), - workspace_size > 0 ? static_cast(workspace.data()) : nullptr, + (workspace_size > 0) ? static_cast(workspace.data()) : nullptr, &workspace_size, stream, seed, @@ -798,6 +812,29 @@ class FMHAGateRef { phi::DenseTensor k_transpose_out_grad; phi::DenseTensor v_transpose_out_grad; phi::DenseTensor qkv_transpose_out_grad; + + bool is_bf16 = + qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; + + // if (config->UseFlashAttn(merge_qkv_, config->use_flash_attn && is_bf16)) + // { + // PADDLE_ENFORCE_NOT_NULL( + // qkv_transpose_out, + // platform::errors::NotFound("The input qkv_transpose_out can not be + // " + // "nullptr when merge_qkv is true.")); + // int64_t q_size = config->GetQuerySize(); + // q_ptr = qkv_transpose_out->data(); + // k_ptr = q_ptr + q_size; + // v_ptr = k_ptr + q_size; + + // qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + // AllocWithDebugInfo( + // dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); + + // } else { + + // } if (merge_qkv_) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 5927f03eca09d..584cb48c1ebe4 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -484,13 +484,20 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool has_gating = ctx.Attr("has_gating"); bool merge_qkv = ctx.Attr("merge_qkv"); + const bool use_flash_attn = ctx.Attr("use_flash_attn"); bool use_fused_matmul_bias = true; auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "query_grad", query_grad); - GateAttentionGradConfig config( - dev_ctx, query, key, query_weight, qkv_weight, merge_qkv, has_gating); + GateAttentionGradConfig config(dev_ctx, + query, + key, + query_weight, + qkv_weight, + merge_qkv, + has_gating, + use_flash_attn); phi::DenseTensor fmha_out_grad; fmha_out_grad.Resize(config.gate_out_dims); From afce9d6bf6a6b202845af3a63a181a3716e7f9ff Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Sun, 23 Apr 2023 17:20:25 +0800 Subject: [PATCH 006/405] add for backward --- cmake/external/flashattn.cmake | 2 +- .../operators/fused/fused_gate_attention.h | 474 +++++++++++++----- .../fused/fused_gate_attention_op.cc | 7 + .../fused/fused_gate_attention_op.cu | 
53 +- paddle/phi/backends/dynload/flashattn.h | 1 + 5 files changed, 400 insertions(+), 137 deletions(-) diff --git a/cmake/external/flashattn.cmake b/cmake/external/flashattn.cmake index 8846fab1d9c76..2cf1ddafaafdc 100644 --- a/cmake/external/flashattn.cmake +++ b/cmake/external/flashattn.cmake @@ -20,7 +20,7 @@ set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn) set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn) set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn) set(FLASHATTN_REPOSITORY ${GIT_URL}/JamesLim-sy/flash-attention.git) -set(FLASHATTN_TAG f009e2b67a98e279cf9d5b446cd21f3260d9cb0f) +set(FLASHATTN_TAG edeaea701e0c0e712f1a43a8970b5d59f5256e3b) #set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git) #set(FLASHATTN_TAG f0edf243a813a65d05c75fcb331b2a95faf96bbc) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index d4db86ec67196..e43a9b77c848c 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -15,7 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/phi/kernels/arange_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +#include "paddle/phi/backends/dynload/flashattn.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/range_function.h" @@ -24,7 +25,6 @@ limitations under the License. */ #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" #ifdef PADDLE_WITH_FLASHATTN -#include "paddle/phi/backends/dynload/flashattn.h" #include "paddle/phi/kernels/flash_attn_kernel.h" #endif @@ -158,8 +158,8 @@ struct GateAttentionConfig { gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, head_dim}; } - bool UseFlashAttn(const bool merge_qkv, const bool is_amp) { - if (!is_amp) { + bool UseFlashAttn(const bool merge_qkv, const bool run_flash_attn) { + if (!run_flash_attn) { return false; } @@ -527,13 +527,13 @@ class FMHAGateRef { int max_seqlen_q_ = batch_size_; int max_seqlen_k_ = batch_size_; int num_splits = 0; // 0 for an internal heuristic, which is optimal - VLOG(6) << "[Flash_attn] batch_size : " << batch_size_; - VLOG(6) << "[Flash_attn] total_q : " << total_q_; - VLOG(6) << "[Flash_attn] total_k : " << total_k_; - VLOG(6) << "[Flash_attn] num_heads : " << num_heads_; - VLOG(6) << "[Flash_attn] head_size : " << head_size_; - VLOG(6) << "[Flash_attn] max_seqlen_q : " << max_seqlen_q_; - VLOG(6) << "[Flash_attn] max_seqlen_k : " << max_seqlen_k_; + VLOG(6) << "[Flash_attn Fwd] batch_size : " << batch_size_; + VLOG(6) << "[Flash_attn Fwd] total_q : " << total_q_; + VLOG(6) << "[Flash_attn Fwd] total_k : " << total_k_; + VLOG(6) << "[Flash_attn Fwd] num_heads : " << num_heads_; + VLOG(6) << "[Flash_attn Fwd] head_size : " << head_size_; + VLOG(6) << "[Flash_attn Fwd] max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "[Flash_attn Fwd] max_seqlen_k : " << max_seqlen_k_; // 5. construct softmax_lse phi::DenseTensor softmax_lse; @@ -606,6 +606,21 @@ class FMHAGateRef { } DBG_WAIT; +#define DBG_INIT(prefix, x) \ + do { \ + printf("[%s, %d] ", __func__, __LINE__); \ + if (x->initialized()) { \ + std::cout << prefix << " is initialized." << std::endl; \ + } else { \ + std::cout << prefix << " is not initialized." 
<< std::endl; \ + } \ + } while (0); + DBG_INIT("qkv_transpose_out", qkv_transpose_out); + DBG_INIT("softmax_out", softmax_out); + DBG_INIT("src_mask", src_mask); + DBG_INIT("fmha_out", fmha_out); + DBG_INIT("gate_out", gate_out); + // 8. flas_attn part two, run impl. succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( static_cast(q_ptr), @@ -799,7 +814,11 @@ class FMHAGateRef { const phi::DenseTensor* fmha_out_grad, phi::DenseTensor* src_mask_grad, phi::DenseTensor* nonbatched_bias_grad, - GateAttentionGradConfig* config) { + GateAttentionGradConfig* config, + const phi::DenseTensor* fmha_out = nullptr, + const phi::DenseTensor* softmax_lse = nullptr, + const phi::DenseTensor* nonbatched_bias = nullptr, + const phi::DenseTensor* src_mask = = nullptr) { const T* q_ptr = nullptr; const T* k_ptr = nullptr; const T* v_ptr = nullptr; @@ -816,31 +835,19 @@ class FMHAGateRef { bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; - // if (config->UseFlashAttn(merge_qkv_, config->use_flash_attn && is_bf16)) - // { - // PADDLE_ENFORCE_NOT_NULL( - // qkv_transpose_out, - // platform::errors::NotFound("The input qkv_transpose_out can not be - // " - // "nullptr when merge_qkv is true.")); - // int64_t q_size = config->GetQuerySize(); - // q_ptr = qkv_transpose_out->data(); - // k_ptr = q_ptr + q_size; - // v_ptr = k_ptr + q_size; - - // qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); - // AllocWithDebugInfo( - // dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); - - // } else { - - // } - if (merge_qkv_) { + if (std::is_same::value) { + std::cout << "[Grad]: T is phi::dtype::float16. \n"; + } else if (std::is_same::value) { + std::cout << "[Grad]: T is phi::dtype::bfloat16. \n"; + } else if (std::is_same::value) { + std::cout << "[Grad]: T is float. \n"; + } + + if (config->UseFlashAttn(merge_qkv_, config->use_flash_attn && is_bf16)) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be " + platform::errors::NotFound("The input qkv_transpose_out can not be" "nullptr when merge_qkv is true.")); - int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); k_ptr = q_ptr + q_size; @@ -850,27 +857,95 @@ class FMHAGateRef { AllocWithDebugInfo( dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); - q_grad_ptr = qkv_transpose_out_grad.data(); - k_grad_ptr = q_grad_ptr + q_size; - v_grad_ptr = k_grad_ptr + q_size; - } else { - PADDLE_ENFORCE_NOT_NULL( - q_transpose_out, - platform::errors::NotFound("The input q_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - k_transpose_out, - platform::errors::NotFound("The input k_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - v_transpose_out, - platform::errors::NotFound("The input v_transpose_out can not be " - "nullptr when merge_qkv is false.")); + int seq_batch_size = static_cast(config->batch_size) * + static_cast(config->seq_len_m); + qkv_transpose_out->Resize( + {3, + seq_batch_size * static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); + DBG_WAIT; + + // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. 
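For reference while reading the step below: cu_seq_q and cu_seq_k are the cumulative-sequence-length arrays that the varlen flash-attention entry points expect, and because every (batch, seq_len_m) slot here contributes exactly seq_len_r tokens, the offsets are simply 0, seq_len_r, 2*seq_len_r, ..., seq_batch_size*seq_len_r. A minimal host-side sketch of what the FlashAttRange kernel fills in on the GPU follows; the helper name MakeCuSeqOffsets is illustrative only and is not part of this patch.

    // Host-side reference for the offsets written by FlashAttRange:
    // out[i] = start + step * i, with start = 0 and step = seq_len_r.
    #include <cstdint>
    #include <vector>

    std::vector<int32_t> MakeCuSeqOffsets(int64_t seq_batch_size, int64_t seq_len_r) {
      // seq_batch_size = batch_size * seq_len_m; every pseudo-sequence holds
      // exactly seq_len_r tokens, so the cumulative lengths are evenly spaced.
      std::vector<int32_t> offsets(static_cast<size_t>(seq_batch_size) + 1);
      for (int64_t i = 0; i <= seq_batch_size; ++i) {
        offsets[static_cast<size_t>(i)] = static_cast<int32_t>(i * seq_len_r);
      }
      return offsets;  // {0, R, 2R, ..., seq_batch_size * R}
    }

Both tensors receive the same offsets because the query and key sequences have identical lengths in this merged-qkv path.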
+ phi::DenseTensor cu_seq_q, cu_seq_k; + int64_t start = 0; + int64_t step = static_cast(config->seq_len_r); + int64_t end_size = (seq_batch_size + 1); + int64_t end = end_size, int64_t seq_size = 0; + phi::funcs::GetSize(start, end, step, &seq_size); + cu_seq_q.Resize({end_size}); + cu_seq_k.Resize({end_size}); + AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_q", &cu_seq_q); + AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_k", &cu_seq_k); + int64_t block = std::min(seq_size, static_cast(256)); + int64_t grid = (seq_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, cu_seq_q.data(), cu_seq_k.data()); + VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start + << ", step = " << step << ", end = " << end; + DBG_WAIT; + + // 3. Dealing with mask and bias for flash_attn. + phi::DenseTensor temp_mask, temp_bias; + auto dims_merge_func = [&](const phi::DenseTensor* src_tensor, + phi::DenseTensor* dst_tensor, + const std::string& prefix) { + if (src_tensor) { + int64_t first_dim = 1; + dst_tensor->ShareDataWith(*src_tensor); + auto dims_ = src_tensor->dims(); + for (int i = 0; i < dims_.size() - 3; ++i) { + first_dim *= dims_[i]; + } + auto dims_rank = dims_.size(); + dst_tensor->Resize({first_dim, + dims_[dims_rank - 3], + dims_[dims_rank - 2], + dims_[dims_rank - 1]}); + GetFlashAttnDimsString(prefix, temp_mask.dims()); + } + }; + dims_merge_func(src_mask, &temp_mask, "[Grad] mask_dim"); + dims_merge_func(nonbatched_bias, &temp_bias, "[Grad] bias_dim"); + + auto& qkv_dims = qkv_transpose_out->dims(); + int batch_size_ = seq_batch_size; + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_ = batch_size_; + int max_seqlen_k_ = batch_size_; + int num_splits = 0; + VLOG(6) << "[Flash_attn Grad] batch_size : " << batch_size_; + VLOG(6) << "[Flash_attn Grad] total_q : " << total_q_; + VLOG(6) << "[Flash_attn Grad] total_k : " << total_k_; + VLOG(6) << "[Flash_attn Grad] num_heads : " << num_heads_; + VLOG(6) << "[Flash_attn Grad] head_size : " << head_size_; + VLOG(6) << "[Flash_attn Grad] max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "[Flash_attn Grad] max_seqlen_k : " << max_seqlen_k_; + + // 5. 
construct softmax_lse + int last_q_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; + softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); + AllocWithDebugInfo( + dev_ctx_, "flash_attn: softmax_lse", softmax_lse); + DBG_WAIT; + + phi::DenseTensor softmax_d = phi::Empty( + dev_ctx_, {batch_size_, num_heads_, last_q_dim}); + DBG_WAIT; + + phi::DenseTensor bias_d; + if (nonbatched_bias) { + bias_d = phi::Empty( + dev_ctx_, {batch_size_, num_heads_, max_seqlen_q_, max_seqlen_k_}); + } + DBG_WAIT; q_ptr = q_transpose_out->data(); k_ptr = k_transpose_out->data(); v_ptr = v_transpose_out->data(); - q_transpose_out_grad.Resize(config->q_transpose_out_dims); k_transpose_out_grad.Resize(config->kv_transpose_out_dims); v_transpose_out_grad.Resize(config->kv_transpose_out_dims); @@ -881,99 +956,246 @@ class FMHAGateRef { k_transpose_out_grad.numel() * sizeof(T)); v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, v_transpose_out_grad.numel() * sizeof(T)); - } - phi::DenseTensor softmax_out_grad; - softmax_out_grad.Resize(config->softmax_out_dims); - AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); - - int64_t gemm_batch_size = - config->batch_size * config->seq_len_m * config->num_heads; - { - // Forward: fmha_out = transpose(qktv_out) - phi::DenseTensor qktv_out_grad; - qktv_out_grad.Resize(config->qktv_out_dims); - AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); - ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); - - // Forward: qktv_out = BatchedGEMM(softmax_out, V) - // Backward: - // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) + // 7. flas_attn part one, get temp worksapce size. + uint64_t workspace_size; + float p_dropout = 0.f; + float softmax_scale = static_cast(1); + cudaStream_t stream = dev_ctx_.stream(); + int num_splits = 0; // 0 for an internal heuristic, which is optimal + succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + q_grad_ptr, + k_grad_ptr, + v_grad_ptr, + static_cast( + fmha_out->data()), // total_q x num_heads x head_size, total_k := + // \sum_{i=0}^{b} s_i + static_cast( + fmha_out_grad->data()), // total_q x num_heads, x head_size + static_cast(cu_seq_q.data()), + static_cast(cu_seq_k.data()), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + p_dropout, + softmax_scale, + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16, + num_splits, + softmax_lse.data(), + softmax_d.data(), + nullptr, + bias_d.data(), + &workspace_size, + stream, + seed, + offset, + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? 
temp_bias.data() : nullptr, + temp_mask.dims().Get(), + temp_bias.dims().Get()); + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + DBG_WAIT; + + phi::DenseTensor workspace; + printf("workspace_size = %d\n", workspace_size); + if (workspace_size > 0) { + workspace = phi::Empty( + dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + DBGPTR(workspace.data(), "workspace"); + } + DBG_WAIT; + + succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + q_grad_ptr, + k_grad_ptr, + v_grad_ptr, + static_cast( + fmha_out->data()), // total_q x num_heads x head_size, total_k := + // \sum_{i=0}^{b} s_i + static_cast( + fmha_out_grad->data()), // total_q x num_heads, x head_size + static_cast(cu_seq_q.data()), + static_cast(cu_seq_k.data()), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + p_dropout, + softmax_scale, + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16, + num_splits, + softmax_lse.data(), + softmax_d.data(), + workspace.data(), + bias_d.data(), + &workspace_size, + stream, + seed, + offset, + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? temp_bias.data() : nullptr, + temp_mask.dims().Get(), + temp_bias.dims().Get()); + } else { + if (merge_qkv_) { + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + + qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + AllocWithDebugInfo( + dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); + + q_grad_ptr = qkv_transpose_out_grad.data(); + k_grad_ptr = q_grad_ptr + q_size; + v_grad_ptr = k_grad_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + + q_transpose_out_grad.Resize(config->q_transpose_out_dims); + k_transpose_out_grad.Resize(config->kv_transpose_out_dims); + v_transpose_out_grad.Resize(config->kv_transpose_out_dims); + + q_grad_ptr = dev_ctx_.Alloc( + &q_transpose_out_grad, q_transpose_out_grad.numel() * sizeof(T)); + k_grad_ptr = dev_ctx_.Alloc( + &k_transpose_out_grad, k_transpose_out_grad.numel() * sizeof(T)); + v_grad_ptr = dev_ctx_.Alloc( + &v_transpose_out_grad, v_transpose_out_grad.numel() * sizeof(T)); + } + + phi::DenseTensor softmax_out_grad; + softmax_out_grad.Resize(config->softmax_out_dims); + AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + { + // Forward: fmha_out = transpose(qktv_out) + phi::DenseTensor qktv_out_grad; + qktv_out_grad.Resize(config->qktv_out_dims); + AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); + ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); + + // Forward: qktv_out = 
BatchedGEMM(softmax_out, V) + // Backward: + // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->head_dim; + int64_t gemm_k = config->seq_len_r; + + const T* softmax_out_ptr = softmax_out->data(); + const T* qktv_out_grad_ptr = qktv_out_grad.data(); + ComputeBatchedGEMM(softmax_out_ptr, + qktv_out_grad_ptr, + v_grad_ptr, + true, + false, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size); + + // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) + gemm_m = config->seq_len_r; + gemm_n = config->m_size; + gemm_k = config->head_dim; + + T* softmax_out_grad_ptr = softmax_out_grad.data(); + ComputeBatchedGEMM(qktv_out_grad_ptr, + v_ptr, + softmax_out_grad_ptr, + false, + true, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size); + } + + phi::DenseTensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); + ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, + softmax_out, + src_mask_grad, + qk_out_grad, + nonbatched_bias_grad); + + // Forward: qk_out = BatchedGEMM(Q, K^T) + // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) int64_t gemm_m = config->m_size; int64_t gemm_n = config->head_dim; int64_t gemm_k = config->seq_len_r; + T alpha = static_cast(1.0 / sqrt(config->head_dim)); - const T* softmax_out_ptr = softmax_out->data(); - const T* qktv_out_grad_ptr = qktv_out_grad.data(); - ComputeBatchedGEMM(softmax_out_ptr, - qktv_out_grad_ptr, - v_grad_ptr, + T* qk_out_grad_ptr = qk_out_grad->data(); + ComputeBatchedGEMM(qk_out_grad_ptr, + q_ptr, + k_grad_ptr, true, false, gemm_m, gemm_n, gemm_k, - gemm_batch_size); + gemm_batch_size, + alpha); - // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) + // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) gemm_m = config->seq_len_r; - gemm_n = config->m_size; - gemm_k = config->head_dim; - - T* softmax_out_grad_ptr = softmax_out_grad.data(); - ComputeBatchedGEMM(qktv_out_grad_ptr, - v_ptr, - softmax_out_grad_ptr, + gemm_n = config->head_dim; + gemm_k = config->m_size; + ComputeBatchedGEMM(qk_out_grad_ptr, + k_ptr, + q_grad_ptr, + false, false, - true, gemm_m, gemm_n, gemm_k, - gemm_batch_size); + gemm_batch_size, + alpha); } - phi::DenseTensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); - ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, - softmax_out, - src_mask_grad, - qk_out_grad, - nonbatched_bias_grad); - - // Forward: qk_out = BatchedGEMM(Q, K^T) - // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) - int64_t gemm_m = config->m_size; - int64_t gemm_n = config->head_dim; - int64_t gemm_k = config->seq_len_r; - T alpha = static_cast(1.0 / sqrt(config->head_dim)); - - T* qk_out_grad_ptr = qk_out_grad->data(); - ComputeBatchedGEMM(qk_out_grad_ptr, - q_ptr, - k_grad_ptr, - true, - false, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size, - alpha); - - // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) - gemm_m = config->seq_len_r; - gemm_n = config->head_dim; - gemm_k = config->m_size; - ComputeBatchedGEMM(qk_out_grad_ptr, - k_ptr, - q_grad_ptr, - false, - false, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size, - alpha); - - if (merge_qkv_) { + if (merge_qkv_ || config->use_flash_attn) { phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); } else { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 
e2d8926f53516..0c965ac08745a 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -284,12 +284,19 @@ class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); bool merge_qkv = PADDLE_GET_CONST(bool, op->GetAttr("merge_qkv")); + bool use_flash_attn = PADDLE_GET_CONST(bool, op->GetAttr("use_flash_attn")); if (merge_qkv) { op->SetInput("QKVWeight", this->Input("QKVWeight")); op->SetOutput(framework::GradVarName("QKVWeight"), this->InputGrad("QKVWeight")); op->SetInput("QKVTransposeOut", this->Output("QKVTransposeOut")); + + if (use_flash_attn) { + op->SetInput("NonbatchedBias", this->Input("NonbatchedBias")); + op->SetInput("SrcMask", this->Input("SrcMask")); + op->SetInput("SoftmaxLse", this->Output("SoftmaxLse")); + } } else { op->SetInput("Key", this->Input("Key")); op->SetOutput(framework::GradVarName("Key"), this->InputGrad("Key")); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 584cb48c1ebe4..407ffe0a0820d 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -484,7 +484,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool has_gating = ctx.Attr("has_gating"); bool merge_qkv = ctx.Attr("merge_qkv"); - const bool use_flash_attn = ctx.Attr("use_flash_attn"); + bool use_flash_attn = ctx.Attr("use_flash_attn"); bool use_fused_matmul_bias = true; auto &dev_ctx = ctx.template device_context(); @@ -499,6 +499,17 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { has_gating, use_flash_attn); + const phi::DenseTensor *fwd_out = nullptr; + const phi::DenseTensor *fwd_bias = nullptr; + const phi::DenseTensor *fwd_mask = nullptr; + const phi::DenseTensor *fwd_softmax_lse = nullptr; + if (merge_qkv && use_flash_attn) { + fwd_bias = ctx.Input("NonbatchedBias"); + fwd_mask = ctx.Input("SrcMask"); + fwd_softmax_lse = ctx.Input("SoftmaxLse"); + fwd_out = fmha_out; + } + phi::DenseTensor fmha_out_grad; fmha_out_grad.Resize(config.gate_out_dims); AllocWithDebugInfo(dev_ctx, "fmha_out_grad", &fmha_out_grad); @@ -533,15 +544,37 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { } auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); - fmha_compute.ComputeBackward(q_transpose_out, - k_transpose_out, - v_transpose_out, - qkv_transpose_out, - softmax_out, - &fmha_out_grad, - nullptr, - nonbatched_bias_grad, - &config); + + if (use_flash_attn) { + const phi::DenseTensor *fwd_bias = + ctx.Input("NonbatchedBias"); + const phi::DenseTensor *fwd_mask = ctx.Input("SrcMask"); + const phi::DenseTensor *fwd_softmax_lse = + ctx.Input("SoftmaxLse"); + fmha_compute.ComputeBackward(q_transpose_out, + k_transpose_out, + v_transpose_out, + qkv_transpose_out, + softmax_out, + &fmha_out_grad, + nullptr, + nonbatched_bias_grad, + &config, + fmha_out, // fwd_out + fwd_softmax_lse, + fwd_bias, + fwd_mask); + } else { + fmha_compute.ComputeBackward(q_transpose_out, + k_transpose_out, + v_transpose_out, + qkv_transpose_out, + softmax_out, + &fmha_out_grad, + nullptr, + nonbatched_bias_grad, + &config); + } bool use_addto = has_gating ? 
true : false; if (merge_qkv) { diff --git a/paddle/phi/backends/dynload/flashattn.h b/paddle/phi/backends/dynload/flashattn.h index 36ec693218f3c..8948ec6a46988 100644 --- a/paddle/phi/backends/dynload/flashattn.h +++ b/paddle/phi/backends/dynload/flashattn.h @@ -47,6 +47,7 @@ extern void* flashattn_dso_handle; __macro(flash_attn_fwd); \ __macro(flash_attn_bwd); \ __macro(flash_attn_fwd_with_bias_and_mask); \ + __macro(flash_attn_bwd_with_bias_and_mask); \ __macro(flash_attn_error); FLASHATTN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP); From 9f76b5f0650dfb4dbec6254f5501d1cffece6d24 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Sun, 23 Apr 2023 17:27:35 +0800 Subject: [PATCH 007/405] fix bugs for backward --- cmake/external/flashattn.cmake | 2 +- .../operators/fused/fused_gate_attention.h | 108 +++++++++++------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/cmake/external/flashattn.cmake b/cmake/external/flashattn.cmake index 2cf1ddafaafdc..fd9459aad7072 100644 --- a/cmake/external/flashattn.cmake +++ b/cmake/external/flashattn.cmake @@ -20,7 +20,7 @@ set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn) set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn) set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn) set(FLASHATTN_REPOSITORY ${GIT_URL}/JamesLim-sy/flash-attention.git) -set(FLASHATTN_TAG edeaea701e0c0e712f1a43a8970b5d59f5256e3b) +set(FLASHATTN_TAG opitmization_for_alphafold2) #set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git) #set(FLASHATTN_TAG f0edf243a813a65d05c75fcb331b2a95faf96bbc) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index e43a9b77c848c..282b1940aa325 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -548,7 +548,7 @@ class FMHAGateRef { uint64_t inc = batch_size_ * num_heads_ * 32; auto seed_offset_pair = gen->IncrementOffset(inc); uint64_t seed = seed_offset_pair.first; - uint64_t offset = 1908576; // seed_offset_pair.second; + uint64_t offset = seed_offset_pair.second; GetFlashAttnDimsString("softmax_out", softmax_out->dims()); GetFlashAttnDimsString("softmax_lse", softmax_lse.dims()); @@ -566,8 +566,8 @@ class FMHAGateRef { static_cast(k_ptr), static_cast(v_ptr), nullptr, // for calculation workspace size - static_cast(cu_seq_q.data()), - static_cast(cu_seq_k.data()), + cu_seq_q.data(), + cu_seq_k.data(), total_q_, total_k_, batch_size_, @@ -628,8 +628,8 @@ class FMHAGateRef { static_cast(v_ptr), static_cast( fmha_out->data()), // for calculation workspace size - static_cast(cu_seq_q.data()), - static_cast(cu_seq_k.data()), + cu_seq_q.data(), + cu_seq_k.data(), total_q_, total_k_, batch_size_, @@ -859,11 +859,6 @@ class FMHAGateRef { int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); - qkv_transpose_out->Resize( - {3, - seq_batch_size * static_cast(config->seq_len_r), - static_cast(config->num_heads), - static_cast(config->head_dim)}); DBG_WAIT; // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. 
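The first hunk below repairs a line added in the previous commit that does not compile: a comma-separated declaration allows only one type specifier, so int64_t end = end_size, int64_t seq_size = 0; is ill-formed, and the fix splits it into two statements. A standalone illustration of the rule, not taken from the patch:

    #include <cstdint>

    int main() {
      int64_t end_size = 8;
      // int64_t end = end_size, int64_t seq_size = 0;  // ill-formed: the type
      //                                                 // cannot be repeated after ','
      int64_t end = end_size, seq_size = 0;  // legal: one type, two declarators
      int64_t end_alt = end_size;            // what the patch does instead:
      int64_t seq_size_alt = 0;              // two separate declarations
      (void)end; (void)seq_size; (void)end_alt; (void)seq_size_alt;
      return 0;
    }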
@@ -871,7 +866,8 @@ class FMHAGateRef { int64_t start = 0; int64_t step = static_cast(config->seq_len_r); int64_t end_size = (seq_batch_size + 1); - int64_t end = end_size, int64_t seq_size = 0; + int64_t end = end_size; + int64_t seq_size = 0; phi::funcs::GetSize(start, end, step, &seq_size); cu_seq_q.Resize({end_size}); cu_seq_k.Resize({end_size}); @@ -908,7 +904,10 @@ class FMHAGateRef { dims_merge_func(src_mask, &temp_mask, "[Grad] mask_dim"); dims_merge_func(nonbatched_bias, &temp_bias, "[Grad] bias_dim"); - auto& qkv_dims = qkv_transpose_out->dims(); + phi::DDim qkv_dims({3, + seq_batch_size * static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); int batch_size_ = seq_batch_size; int total_q_ = qkv_dims[1]; // q.dims()[0] int total_k_ = qkv_dims[1]; // q.dims()[0] @@ -916,7 +915,6 @@ class FMHAGateRef { int head_size_ = qkv_dims[3]; // q.dims()[2] int max_seqlen_q_ = batch_size_; int max_seqlen_k_ = batch_size_; - int num_splits = 0; VLOG(6) << "[Flash_attn Grad] batch_size : " << batch_size_; VLOG(6) << "[Flash_attn Grad] total_q : " << total_q_; VLOG(6) << "[Flash_attn Grad] total_k : " << total_k_; @@ -927,9 +925,9 @@ class FMHAGateRef { // 5. construct softmax_lse int last_q_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; - softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); - AllocWithDebugInfo( - dev_ctx_, "flash_attn: softmax_lse", softmax_lse); + // softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); + // AllocWithDebugInfo( + // dev_ctx_, "flash_attn: softmax_lse", softmax_lse); DBG_WAIT; phi::DenseTensor softmax_d = phi::Empty( @@ -957,26 +955,33 @@ class FMHAGateRef { v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, v_transpose_out_grad.numel() * sizeof(T)); + // 6. construct random seed + auto gen = dev_ctx_.GetGenerator(); + uint64_t inc = batch_size_ * num_heads_ * 32; + auto seed_offset_pair = gen->IncrementOffset(inc); + uint64_t seed = seed_offset_pair.first; + uint64_t offset = seed_offset_pair.second; + // 7. flas_attn part one, get temp worksapce size. uint64_t workspace_size; float p_dropout = 0.f; float softmax_scale = static_cast(1); cudaStream_t stream = dev_ctx_.stream(); int num_splits = 0; // 0 for an internal heuristic, which is optimal - succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( + bool succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), - q_grad_ptr, - k_grad_ptr, - v_grad_ptr, - static_cast( - fmha_out->data()), // total_q x num_heads x head_size, total_k := + static_cast(q_grad_ptr), + static_cast(k_grad_ptr), + static_cast(v_grad_ptr), + static_cast( + fmha_out->data()), // total_q x num_heads x head_size, total_k : // \sum_{i=0}^{b} s_i - static_cast( + static_cast( fmha_out_grad->data()), // total_q x num_heads, x head_size - static_cast(cu_seq_q.data()), - static_cast(cu_seq_k.data()), + cu_seq_q.data(), + cu_seq_k.data(), total_q_, total_k_, batch_size_, @@ -990,16 +995,16 @@ class FMHAGateRef { /*is_causal=*/false, is_bf16, num_splits, - softmax_lse.data(), + softmax_lse->data(), softmax_d.data(), - nullptr, bias_d.data(), + nullptr, &workspace_size, stream, seed, offset, - src_mask ? temp_mask.data() : nullptr, nonbatched_bias ? temp_bias.data() : nullptr, + src_mask ? 
temp_mask.data() : nullptr, temp_mask.dims().Get(), temp_bias.dims().Get()); if (!succ) { @@ -1020,16 +1025,13 @@ class FMHAGateRef { static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), - q_grad_ptr, - k_grad_ptr, - v_grad_ptr, - static_cast( - fmha_out->data()), // total_q x num_heads x head_size, total_k := - // \sum_{i=0}^{b} s_i - static_cast( - fmha_out_grad->data()), // total_q x num_heads, x head_size - static_cast(cu_seq_q.data()), - static_cast(cu_seq_k.data()), + static_cast(q_grad_ptr), + static_cast(k_grad_ptr), + static_cast(v_grad_ptr), + static_cast(fmha_out->data()), + static_cast(fmha_out_grad->data()), + cu_seq_q.data(), + cu_seq_k.data(), total_q_, total_k_, batch_size_, @@ -1043,10 +1045,10 @@ class FMHAGateRef { /*is_causal=*/false, is_bf16, num_splits, - softmax_lse.data(), + softmax_lse->data(), softmax_d.data(), - workspace.data(), bias_d.data(), + workspace.data(), &workspace_size, stream, seed, @@ -1055,6 +1057,32 @@ class FMHAGateRef { nonbatched_bias ? temp_bias.data() : nullptr, temp_mask.dims().Get(), temp_bias.dims().Get()); + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + DBG_WAIT; + + if (nonbatched_bias) { + // compare block reduce + // auto size = attn_bias->sizes(); + // dbias = ds.reshape({ -1, size[0], size[1], size[2], size[3] }).sum({ + // 0 }); result.push_back( dbias ); + const auto temp_bias_num = temp_bias.numel(); + const auto bias_d_num = bias_d.numel(); + auto dbias_first_dim = bias_d_num / temp_bias_num; + bias_d.Resize({dbias_first_dim, + temp_bias.dims()[0], + temp_bias.dims()[1], + temp_bias.dims()[2], + temp_bias.dims()[3]}); + phi::funcs:: + ReduceKernel>( + dev_ctx_, + bias_d, + nonbatched_bias_grad, + kps::IdentityFunctor(), + {0}); + } } else { if (merge_qkv_) { PADDLE_ENFORCE_NOT_NULL( From 05b3444faab07c59e37ac52d408252a730e31f46 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Mon, 24 Apr 2023 10:57:37 +0800 Subject: [PATCH 008/405] 04-24 first commit --- paddle/fluid/operators/fused/fused_gate_attention.h | 11 ++++------- .../fluid/operators/fused/fused_gate_attention_op.cu | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 282b1940aa325..7cacf8f1b1f31 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -818,7 +818,7 @@ class FMHAGateRef { const phi::DenseTensor* fmha_out = nullptr, const phi::DenseTensor* softmax_lse = nullptr, const phi::DenseTensor* nonbatched_bias = nullptr, - const phi::DenseTensor* src_mask = = nullptr) { + const phi::DenseTensor* src_mask = nullptr) { const T* q_ptr = nullptr; const T* k_ptr = nullptr; const T* v_ptr = nullptr; @@ -975,11 +975,8 @@ class FMHAGateRef { static_cast(q_grad_ptr), static_cast(k_grad_ptr), static_cast(v_grad_ptr), - static_cast( - fmha_out->data()), // total_q x num_heads x head_size, total_k : - // \sum_{i=0}^{b} s_i - static_cast( - fmha_out_grad->data()), // total_q x num_heads, x head_size + static_cast(fmha_out->data()), + static_cast(fmha_out_grad->data()), cu_seq_q.data(), cu_seq_k.data(), total_q_, @@ -1003,8 +1000,8 @@ class FMHAGateRef { stream, seed, offset, - nonbatched_bias ? temp_bias.data() : nullptr, src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? 
temp_bias.data() : nullptr, temp_mask.dims().Get(), temp_bias.dims().Get()); if (!succ) { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 407ffe0a0820d..446d89accfcae 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -560,7 +560,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { nullptr, nonbatched_bias_grad, &config, - fmha_out, // fwd_out + fmha_out, fwd_softmax_lse, fwd_bias, fwd_mask); From 41e90283ac58a1f46216cf7b6f1f4426bac688ad Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Mon, 24 Apr 2023 14:45:44 +0800 Subject: [PATCH 009/405] [AMP]expand blacklists for amp training (#50940) --- python/paddle/amp/__init__.py | 6 +- python/paddle/amp/amp_lists.py | 110 ++++++++++++++++++ python/paddle/amp/auto_cast.py | 110 ++---------------- ...perative_auto_mixed_precision_for_eager.py | 8 +- test/amp/test_amp_list.py | 63 +++++++--- 5 files changed, 175 insertions(+), 122 deletions(-) create mode 100644 python/paddle/amp/amp_lists.py diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py index 60df9de03ad11..5fa8055ba233b 100644 --- a/python/paddle/amp/__init__.py +++ b/python/paddle/amp/__init__.py @@ -16,10 +16,8 @@ from .auto_cast import decorate # noqa: F401 from .auto_cast import amp_guard # noqa: F401 from .auto_cast import amp_decorate # noqa: F401 -from .auto_cast import FP16_WHITE_LIST # noqa: F401 -from .auto_cast import FP16_BLACK_LIST # noqa: F401 -from .auto_cast import PURE_FP16_WHITE_LIST # noqa: F401 -from .auto_cast import PURE_FP16_BLACK_LIST # noqa: F401 +from .amp_lists import white_list # noqa: F401 +from .amp_lists import black_list # noqa: F401 from . import grad_scaler # noqa: F401 from .grad_scaler import GradScaler # noqa: F401 diff --git a/python/paddle/amp/amp_lists.py b/python/paddle/amp/amp_lists.py new file mode 100644 index 0000000000000..f70c8f5ed7f91 --- /dev/null +++ b/python/paddle/amp/amp_lists.py @@ -0,0 +1,110 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The set of ops that support fp16 calculation and are considered numerically- +# safe and performance-critical. These ops are always converted to fp16. +FP16_WHITE_LIST = { + 'conv2d', + 'matmul', + 'matmul_v2', + 'max_pool2d_with_index', + 'mul', + 'fake_quantize_dequantize_abs_max', + 'fake_quantize_dequantize_moving_average_abs_max', +} + +# The set of ops that support fp16 calculation and are considered numerically- +# dangerous and whose effects may also be observed in downstream ops. 
+FP16_BLACK_LIST = { + 'tan', + 'acos', + 'asin', + 'sinh', + 'cosh', + 'atanh', + 'tanh_shrink', + 'cos_sim', + 'erfinv', + 'exp', + 'expm1', + 'log', + 'log10', + 'log2', + 'reciprocal', + 'rsqrt', + 'pow', + 'square', + 'reduce_sum', + 'mean', + 'reduce_mean', + 'reduce_prod', + 'cumprod', + 'cumsum', + 'dist', + 'pnorm', + 'frobenius_norm', + 'renorm', + 'group_norm', + 'layer_norm', + 'softmax', + 'softmin', + 'softplus', + 'log_softmax', + 'softmax_with_cross_entropy', + 'sigmoid_cross_entropy_with_logits', + 'c_softmax_with_cross_entropy', + 'cross_entropy', + 'cross_entropy2', + 'nll_loss', + 'huber_loss', + 'triplet_margin_loss', + 'log_loss', + 'hsigmoid_loss', + 'margin_cross_entropy', +} + +# FP16 performance of grad op is worse than that of FP32. Use FP32 by default. +FP16_EXTRA_BLACK_LIST = { + 'linear_interp_v2', + 'nearest_interp_v2', + 'bilinear_interp_v2', + 'bicubic_interp_v2', + 'trilinear_interp_v2', + 'lookup_table', + 'lookup_table_v2', + 'scatter', + 'depthwise_conv2d', +} + +BF16_WHITE_LIST = {'conv2d', 'matmul_v2'} +BF16_BLACK_LIST = set() + + +def white_list(): + white_list = { + "float16": {"O1": FP16_WHITE_LIST, "O2": FP16_WHITE_LIST}, + "bfloat16": {"O1": BF16_WHITE_LIST, "O2": BF16_WHITE_LIST}, + } + return white_list + + +def black_list(): + black_list = { + "float16": { + "O1": FP16_BLACK_LIST | FP16_EXTRA_BLACK_LIST, + "O2": FP16_EXTRA_BLACK_LIST, + }, + "bfloat16": {"O1": BF16_BLACK_LIST, "O2": set()}, + } + return black_list diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index e8f552607affc..1f82533edbfb3 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -20,45 +20,7 @@ from paddle.fluid.framework import _dygraph_tracer, dygraph_only from paddle.fluid.wrapped_decorator import signature_safe_contextmanager -AMP_LEVEL = core.AmpLevel - -# The set of ops that support fp16 calculation and are considered numerically- -# safe and performance-critical. These ops are always converted to fp16. -FP16_WHITE_LIST = { - 'conv2d', - 'matmul', - 'matmul_v2', - 'max_pool2d_with_index', - 'mul', - 'fake_quantize_dequantize_abs_max', - 'fake_quantize_dequantize_moving_average_abs_max', -} - -# The set of ops that support fp16 calculation and are considered numerically- -# dangerous and whose effects may also be observed in downstream ops. -FP16_BLACK_LIST = { - 'exp', - 'square', - 'log', - 'mean', - 'sum', - 'cos_sim', - 'softmax', - 'softmax_with_cross_entropy', - 'sigmoid_cross_entropy_with_logits', - 'c_softmax_with_cross_entropy', - 'cross_entropy', - 'cross_entropy2', - # default fp32 can avoid return inf when the sum value large than 65504 - 'reduce_sum', - # FP16 performance of grad op is worse than that of FP32. Use FP32 by default. - 'linear_interp_v2', - 'nearest_interp_v2', - 'bilinear_interp_v2', - 'bicubic_interp_v2', - 'trilinear_interp_v2', -} - +from .amp_lists import black_list, white_list AMP_RELATED_FLAGS = [ 'FLAGS_cudnn_exhaustive_search', @@ -72,27 +34,7 @@ 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, } -PURE_FP16_WHITE_LIST = copy.copy(FP16_WHITE_LIST) - -PURE_FP16_BLACK_LIST = { - 'lookup_table', - 'lookup_table_v2', - 'scatter', - 'scatter_grad', - # FP16 performance of grad op is worse than that of FP32. Use FP32 by default. 
- 'linear_interp_v2', - 'nearest_interp_v2', - 'bilinear_interp_v2', - 'bicubic_interp_v2', - 'trilinear_interp_v2', -} - -BF16_WHITE_LIST = {'conv2d', 'matmul_v2'} -BF16_BLACK_LIST = set() - -PURE_BF16_WHITE_LIST = copy.copy(BF16_WHITE_LIST) -PURE_BF16_BLACK_LIST = set() - +AMP_LEVEL = core.AmpLevel _g_amp_state_ = None @@ -126,20 +68,12 @@ def _update_list( """ Update black and white list according to users' custom list. """ - if dtype == 'float16': - if level == 'O1': - _white_list = copy.copy(FP16_WHITE_LIST) - _black_list = copy.copy(FP16_BLACK_LIST) - else: - _white_list = copy.copy(PURE_FP16_WHITE_LIST) - _black_list = copy.copy(PURE_FP16_BLACK_LIST) - else: - if level == 'O1': - _white_list = copy.copy(BF16_WHITE_LIST) - _black_list = copy.copy(BF16_BLACK_LIST) - else: - _white_list = copy.copy(PURE_BF16_WHITE_LIST) - _black_list = copy.copy(PURE_BF16_BLACK_LIST) + if level == 'O0': + _white_list = set() + _black_list = set() + return _white_list, _black_list + _white_list = copy.copy(white_list()[dtype][level]) + _black_list = copy.copy(black_list()[dtype][level]) if custom_white_list and custom_black_list: for op_name in custom_white_list: if op_name in custom_black_list: @@ -453,34 +387,14 @@ def amp_guard( if level == 'O1': amp_level = AMP_LEVEL.O1 - if dtype == 'float16': - _white_list = FP16_WHITE_LIST - _black_list = FP16_BLACK_LIST - elif dtype == 'bfloat16': - _white_list = BF16_WHITE_LIST - _black_list = BF16_BLACK_LIST - elif level == 'O2': amp_level = AMP_LEVEL.O2 - if dtype == 'float16': - _white_list = PURE_FP16_WHITE_LIST - _black_list = PURE_FP16_BLACK_LIST - elif dtype == 'bfloat16': - _white_list = BF16_WHITE_LIST - _black_list = BF16_BLACK_LIST elif level == 'O0': amp_level = AMP_LEVEL.O0 - if dtype == 'float16': - _white_list = FP16_WHITE_LIST - _black_list = FP16_BLACK_LIST - elif dtype == 'bfloat16': - _white_list = BF16_WHITE_LIST - _black_list = BF16_BLACK_LIST - - if custom_white_list or custom_black_list: - _white_list, _black_list = _update_list( - custom_white_list, custom_black_list, level, dtype - ) + + _white_list, _black_list = _update_list( + custom_white_list, custom_black_list, level, dtype + ) if not enable: amp_level = AMP_LEVEL.O0 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index 7a7d65d27d556..8d24febaff213 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -88,8 +88,8 @@ def test_amp_guard_black_op(self): def custom_op_list(self): with fluid.dygraph.guard(): tracer = fluid.framework._dygraph_tracer() - base_white_list = paddle.amp.FP16_WHITE_LIST - base_black_list = paddle.amp.FP16_BLACK_LIST + base_white_list = paddle.amp.white_list()["float16"]["O1"] + base_black_list = paddle.amp.black_list()["float16"]["O1"] with paddle.amp.amp_guard( custom_white_list=["log"], custom_black_list=["conv2d"] ): @@ -104,8 +104,8 @@ def custom_op_list(self): == (set(base_black_list) - {"log"}) | {"conv2d"} ) - base_white_list = paddle.amp.PURE_FP16_WHITE_LIST - base_black_list = paddle.amp.PURE_FP16_BLACK_LIST + base_white_list = paddle.amp.white_list()["float16"]["O2"] + base_black_list = paddle.amp.black_list()["float16"]["O2"] with paddle.amp.amp_guard( custom_white_list=["log"], custom_black_list=["conv2d"], 
diff --git a/test/amp/test_amp_list.py b/test/amp/test_amp_list.py index 11bcdbfd3ba6a..9b0bf5129c36d 100644 --- a/test/amp/test_amp_list.py +++ b/test/amp/test_amp_list.py @@ -14,32 +14,63 @@ import unittest +import paddle from paddle.fluid import core -from paddle.static.amp import fp16_lists -from paddle.static.amp.fp16_lists import AutoMixedPrecisionLists +from paddle.static.amp import AutoMixedPrecisionLists, fp16_lists class TestAMPList(unittest.TestCase): - def test_main(self): - custom_white_list = [ - 'lookup_table', - 'lookup_table_v2', - ] - amp_list = AutoMixedPrecisionLists(custom_white_list=custom_white_list) - for op in custom_white_list: - self.assertTrue(op in amp_list.white_list) - self.assertTrue(op not in amp_list.black_list) - self.assertTrue(op not in amp_list.unsupported_list) - - default_black_list = [ + def setUp(self): + self.default_black_list = [ 'linear_interp_v2', 'nearest_interp_v2', 'bilinear_interp_v2', 'bicubic_interp_v2', 'trilinear_interp_v2', ] - for op in default_black_list: - self.assertTrue(op in amp_list.black_list) + self.custom_white_list = [ + 'lookup_table', + 'lookup_table_v2', + ] + + def check_if_op_in_list(self, op_list, amp_list): + for op in op_list: + self.assertTrue(op in amp_list) + + def check_if_op_not_in_list(self, op_list, amp_list): + for op in op_list: + self.assertTrue(op not in amp_list) + + def test_static(self): + amp_list = AutoMixedPrecisionLists( + custom_white_list=self.custom_white_list + ) + self.check_if_op_in_list(self.default_black_list, amp_list.black_list) + self.check_if_op_in_list(self.custom_white_list, amp_list.white_list) + self.check_if_op_not_in_list( + self.custom_white_list, amp_list.black_list + ) + self.check_if_op_not_in_list( + self.custom_white_list, amp_list.unsupported_list + ) + + def test_eager(self): + if not paddle.amp.is_float16_supported(): + return + white_list = paddle.amp.white_list() + black_list = paddle.amp.black_list() + self.check_if_op_in_list( + self.default_black_list, black_list["float16"]["O2"] + ) + self.check_if_op_not_in_list(['log', 'elementwise_add'], white_list) + with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}): + out1 = paddle.rand([2, 3]) + paddle.rand([2, 3]) + out2 = out1.mean() + out3 = paddle.log(out2) + self.check_if_op_not_in_list(['log', 'elementwise_add'], white_list) + self.assertEqual(out1.dtype, paddle.float16) + self.assertEqual(out2.dtype, paddle.float32) + self.assertEqual(out3.dtype, paddle.float32) def test_apis(self): def _run_check_dtype(): From d71615dc2b07dc92034256fd705187d08d9436e1 Mon Sep 17 00:00:00 2001 From: csy0225 <78470701+csy0225@users.noreply.github.com> Date: Mon, 24 Apr 2023 14:54:49 +0800 Subject: [PATCH 010/405] shared_external mermory add xpu (#53240) --- paddle/fluid/inference/api/details/zero_copy_tensor.cc | 9 ++++++++- paddle/phi/kernels/xpu/linspace_kernel.cc | 6 +++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 147db87a101d4..c10e6b4a43fdd 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -345,9 +345,16 @@ void Tensor::ShareExternalData(const T *data, const_cast(data), size, paddle::platform::CUDAPlace(device_)), meta); *tensor = std::move(dtensor); + } else if (place == PlaceType::kXPU) { + phi::DenseTensor dtensor( + std::make_shared( + const_cast(data), size, paddle::platform::XPUPlace(device_)), 
+ meta); + *tensor = std::move(dtensor); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "PlaceType must be PlaceType::kCPU or PlaceType::kGPU.")); + "PlaceType must be one of [PlaceType::kCPU, PlaceType::kGPU, " + "PlaceType::kXPU].")); } } diff --git a/paddle/phi/kernels/xpu/linspace_kernel.cc b/paddle/phi/kernels/xpu/linspace_kernel.cc index e33a6d73f1c5d..e61676bae49c7 100644 --- a/paddle/phi/kernels/xpu/linspace_kernel.cc +++ b/paddle/phi/kernels/xpu/linspace_kernel.cc @@ -81,4 +81,8 @@ void LinspaceKernel(const Context& ctx, } // namespace phi PD_REGISTER_KERNEL( - linspace, XPU, ALL_LAYOUT, phi::LinspaceKernel, float, int32_t) {} + linspace, XPU, ALL_LAYOUT, phi::LinspaceKernel, float, int32_t) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); +} From 7a9754a7e72d218fdf839762be820c70177ca61c Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 24 Apr 2023 14:59:57 +0800 Subject: [PATCH 011/405] [CppExtension Cuda] Add cuda unit test for CppExtension (#52900) * [CppExtension Cuda] Add cuda unit test for CppExtension * update extra_compile_args for CUDAExtension * add debug info * Add patch to fix CUDA12 compile error * patch for all env * add windows judgement * Try to fix setup function not found error * fix mix_relu_and_extension include file * fix setup compile error * remove useless debug comments * add sleep, debug CI-build * add space to disable cmake cache * remove debug info * add space to pass CI-build --- cmake/external/pybind11.cmake | 9 ++++ cmake/external/warpctc.cmake | 1 + patches/pybind/cast.h.patch | 15 +++++++ .../cpp_extension/cpp_extension_setup.py | 14 ++++-- .../tests/cpp_extension/custom_extension.cc | 3 ++ .../cpp_extension/custom_relu_forward.cu | 45 +++++++++++++++++++ .../mix_relu_and_extension_setup.py | 2 +- .../cpp_extension/test_cpp_extension_jit.py | 13 +++++- .../cpp_extension/test_cpp_extension_setup.py | 13 ++++++ .../paddle/fluid/tests/cpp_extension/utils.py | 42 +++++++++++++++++ .../utils/cpp_extension/extension_utils.py | 9 ++-- 11 files changed, 156 insertions(+), 10 deletions(-) create mode 100644 patches/pybind/cast.h.patch create mode 100644 python/paddle/fluid/tests/cpp_extension/custom_relu_forward.cu diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 6abd24e87305b..db53e3511be44 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -21,6 +21,14 @@ set(PYBIND_TAG v2.10.3) set(PYBIND_INCLUDE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind/include) include_directories(${PYBIND_INCLUDE_DIR}) +set(PYBIND_PATCH_COMMAND "") +if(NOT WIN32) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/pybind/cast.h.patch + native_dst) + set(PYBIND_PATCH_COMMAND patch -d ${PYBIND_INCLUDE_DIR}/pybind11 < + ${native_dst}) +endif() + ExternalProject_Add( extern_pybind ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} @@ -33,6 +41,7 @@ ExternalProject_Add( # third-party library version changes cannot be incorporated. 
# reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html UPDATE_COMMAND "" + PATCH_COMMAND ${PYBIND_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index e1e7234da0e25..46befee8bd255 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -82,6 +82,7 @@ else() set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} diff --git a/patches/pybind/cast.h.patch b/patches/pybind/cast.h.patch new file mode 100644 index 0000000000000..ebd65571ebf82 --- /dev/null +++ b/patches/pybind/cast.h.patch @@ -0,0 +1,15 @@ +diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h +index 3a404602..9054478c 100644 +--- a/include/pybind11/cast.h ++++ b/include/pybind11/cast.h +@@ -42,7 +42,9 @@ using make_caster = type_caster>; + // Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T + template + typename make_caster::template cast_op_type cast_op(make_caster &caster) { +- return caster.operator typename make_caster::template cast_op_type(); ++ // https://github.com/pybind/pybind11/issues/4606 with CUDA 12 ++ //return caster.operator typename make_caster::template cast_op_type(); ++ return caster; + } + template + typename make_caster::template cast_op_type::type> diff --git a/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py b/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py index b5c12284c117e..5a4ff2afd6c63 100644 --- a/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py +++ b/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py @@ -15,7 +15,9 @@ import os from site import getsitepackages -from paddle.utils.cpp_extension import CppExtension, setup +from utils import extra_compile_args + +from paddle.utils.cpp_extension import CUDAExtension, setup paddle_includes = [] for site_packages_path in getsitepackages(): @@ -30,10 +32,14 @@ setup( name='custom_cpp_extension', - ext_modules=CppExtension( - sources=["custom_extension.cc", "custom_sub.cc"], + ext_modules=CUDAExtension( + sources=[ + "custom_extension.cc", + "custom_sub.cc", + "custom_relu_forward.cu", + ], include_dirs=paddle_includes, - extra_compile_args={'cc': ['-w', '-g']}, + extra_compile_args=extra_compile_args, verbose=True, ), ) diff --git a/python/paddle/fluid/tests/cpp_extension/custom_extension.cc b/python/paddle/fluid/tests/cpp_extension/custom_extension.cc index 2334e23af536c..2fc5c42a80d75 100644 --- a/python/paddle/fluid/tests/cpp_extension/custom_extension.cc +++ b/python/paddle/fluid/tests/cpp_extension/custom_extension.cc @@ -20,6 +20,8 @@ paddle::Tensor custom_sub(paddle::Tensor x, paddle::Tensor y); +paddle::Tensor relu_cuda_forward(const paddle::Tensor& x); + paddle::Tensor custom_add(const paddle::Tensor& x, const paddle::Tensor& y) { return x.exp() + y.exp(); } @@ -46,6 +48,7 @@ PYBIND11_MODULE(custom_cpp_extension, m) { m.def("nullable_tensor", &nullable_tensor, "returned Tensor might be None"); m.def( "optional_tensor", &optional_tensor, "returned Tensor might be optional"); + m.def("relu_cuda_forward", &relu_cuda_forward, "relu(x)"); py::class_(m, "Power") .def(py::init()) diff --git a/python/paddle/fluid/tests/cpp_extension/custom_relu_forward.cu b/python/paddle/fluid/tests/cpp_extension/custom_relu_forward.cu new file mode 100644 index 
0000000000000..e0405309f7add --- /dev/null +++ b/python/paddle/fluid/tests/cpp_extension/custom_relu_forward.cu @@ -0,0 +1,45 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +#define CHECK_GPU_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") + +template +__global__ void relu_cuda_forward_kernel(const data_t* x, + data_t* y, + int64_t num) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) { + y[i] = x[i] > static_cast(0.) ? x[i] : static_cast(0.); + } +} + +paddle::Tensor relu_cuda_forward(const paddle::Tensor& x) { + CHECK_GPU_INPUT(x); + auto out = paddle::empty_like(x); + + PD_CHECK(x.place() == paddle::DefaultGPUPlace()); + + int64_t numel = x.numel(); + int64_t block = 512; + int64_t grid = (numel + block - 1) / block; + PD_DISPATCH_FLOATING_AND_HALF_TYPES( + x.type(), "relu_cuda_forward_kernel", ([&] { + relu_cuda_forward_kernel<<>>( + x.data(), out.data(), numel); + })); + + return out; +} diff --git a/python/paddle/fluid/tests/cpp_extension/mix_relu_and_extension_setup.py b/python/paddle/fluid/tests/cpp_extension/mix_relu_and_extension_setup.py index 3766d33f03443..823d0183cfda8 100644 --- a/python/paddle/fluid/tests/cpp_extension/mix_relu_and_extension_setup.py +++ b/python/paddle/fluid/tests/cpp_extension/mix_relu_and_extension_setup.py @@ -21,7 +21,7 @@ setup( name='mix_relu_extension', ext_modules=CppExtension( - sources=["mix_relu_and_extension.cc", "custom_sub.cc"], + sources=["mix_relu_and_extension.cc"], include_dirs=paddle_includes + [os.path.dirname(os.path.abspath(__file__))], extra_compile_args={'cc': ['-w', '-g']}, diff --git a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py index 9ed330a2b4ac7..bc6f8113afd91 100644 --- a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py +++ b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py @@ -18,6 +18,7 @@ from site import getsitepackages import numpy as np +from utils import check_output import paddle from paddle.utils.cpp_extension import load @@ -27,7 +28,7 @@ sys.exit() # Compile and load cpp extension Just-In-Time. 
-sources = ["custom_extension.cc", "custom_sub.cc"] +sources = ["custom_extension.cc", "custom_sub.cc", "custom_relu_forward.cu"] paddle_includes = [] for site_packages_path in getsitepackages(): paddle_includes.append( @@ -69,6 +70,8 @@ def test_cpp_extension(self): self._test_extension_class() self._test_nullable_tensor() self._test_optional_tensor() + if paddle.is_compiled_with_cuda(): + self._test_cuda_relu() def _test_extension_function(self): for dtype in self.dtypes: @@ -130,6 +133,14 @@ def _test_optional_tensor(self): err_msg=f'extension out: {x},\n numpy out: {x_np}', ) + def _test_cuda_relu(self): + paddle.set_device('gpu') + x = np.random.uniform(-1, 1, [4, 8]).astype('float32') + x = paddle.to_tensor(x, dtype='float32') + out = custom_cpp_extension.relu_cuda_forward(x) + pd_out = paddle.nn.functional.relu(x) + check_output(out, pd_out, "out") + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py index 5c8c91ed30356..53dffde432095 100644 --- a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py +++ b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py @@ -18,6 +18,7 @@ import unittest import numpy as np +from utils import check_output import paddle from paddle import static @@ -154,6 +155,8 @@ def test_cpp_extension(self): self._test_static() self._test_dynamic() self._test_double_grad_dynamic() + if paddle.is_compiled_with_cuda(): + self._test_cuda_relu() def _test_extension_function_plain(self): import custom_cpp_extension @@ -314,6 +317,16 @@ def _test_double_grad_dynamic(self): ), ) + def _test_cuda_relu(self): + import custom_cpp_extension + + paddle.set_device('gpu') + x = np.random.uniform(-1, 1, [4, 8]).astype('float32') + x = paddle.to_tensor(x, dtype='float32') + out = custom_cpp_extension.relu_cuda_forward(x) + pd_out = paddle.nn.functional.relu(x) + check_output(out, pd_out, "out") + if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): diff --git a/python/paddle/fluid/tests/cpp_extension/utils.py b/python/paddle/fluid/tests/cpp_extension/utils.py index 5c5a458a5c7a6..19659c6d5d716 100644 --- a/python/paddle/fluid/tests/cpp_extension/utils.py +++ b/python/paddle/fluid/tests/cpp_extension/utils.py @@ -16,6 +16,8 @@ import sys from site import getsitepackages +import numpy as np + from paddle.utils.cpp_extension.extension_utils import IS_WINDOWS IS_MAC = sys.platform.startswith('darwin') @@ -37,3 +39,43 @@ extra_cc_args = ['-w', '-g'] if not IS_WINDOWS else ['/w'] extra_nvcc_args = ['-O3'] extra_compile_args = {'cc': extra_cc_args, 'nvcc': extra_nvcc_args} + + +def check_output(out, pd_out, name): + if out is None and pd_out is None: + return + assert out is not None, "out value of " + name + " is None" + assert pd_out is not None, "pd_out value of " + name + " is None" + if isinstance(out, list) and isinstance(pd_out, list): + for idx in range(len(out)): + np.testing.assert_array_equal( + out[idx], + pd_out[idx], + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out[idx], name, pd_out[idx] + ), + ) + else: + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out, name, pd_out + ), + ) + + +def check_output_allclose(out, pd_out, name, rtol=5e-5, atol=1e-2): + if out is None and pd_out is None: + return + assert out is not None, "out value of " + name + " is None" + assert pd_out is not 
None, "pd_out value of " + name + " is None" + np.testing.assert_allclose( + out, + pd_out, + rtol, + atol, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out, name, pd_out + ), + ) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 511b72378c78f..7da4d1f1ba122 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -180,6 +180,9 @@ def custom_write_stub(resource, pyfile): def __bootstrap__(): assert os.path.exists(so_path) + # load custom op shared library with abs path + custom_ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(so_path) + if os.name == 'nt' or sys.platform.startswith('darwin'): # Cpp Extension only support Linux now mod = types.ModuleType(__name__) @@ -193,10 +196,8 @@ def __bootstrap__(): except ImportError: mod = types.ModuleType(__name__) - # load custom op shared library with abs path - custom_ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(so_path) - for custom_ops in custom_ops: - setattr(mod, custom_ops, eval(custom_ops)) + for custom_op in custom_ops: + setattr(mod, custom_op, eval(custom_op)) __bootstrap__() From 680460fd37778c5f78925a2471609b4addf7f393 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 24 Apr 2023 15:12:20 +0800 Subject: [PATCH 012/405] [AMP] Allow to enable multi_precision through paddle.static.amp.decorate and add documents for some apis. (#53012) * Add document for some apis. test=docs_preview * Allow to set master_weight in paddle.static.amp.decorate. * Polish codes and add unittest. * Refine docs. * Remove the repetitive function. --- python/paddle/static/amp/debugging.py | 101 ++++++++++++++++++++--- python/paddle/static/amp/decorator.py | 111 ++++++++++++++++++++++++-- test/amp/amp_base_models.py | 1 - test/amp/test_model_cast_to_bf16.py | 23 ++++++ 4 files changed, 220 insertions(+), 16 deletions(-) diff --git a/python/paddle/static/amp/debugging.py b/python/paddle/static/amp/debugging.py index 28abe84c39b2e..5a894495d98f5 100644 --- a/python/paddle/static/amp/debugging.py +++ b/python/paddle/static/amp/debugging.py @@ -13,8 +13,14 @@ # limitations under the License. 
import copy +import logging import paddle +from paddle.fluid.log_helper import get_logger + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) class OperatorStatsUnit: @@ -76,7 +82,7 @@ def _get_var_dtype_from_block(block, op, arg_name, is_input): var = block._var_recursive(var_name) return var.dtype except: - print( + _logger.warning( "Operator < {} > gets {} < {} : {} > error!".format( op.type, "input" if is_input else "output", arg_name, var_name ) @@ -99,7 +105,7 @@ def _extract_compute_dtype(op, block): if _is_floating_point(compute_dtype) and _is_floating_point( var_dtype ): - print( + _logger.warning( "Operator < {} > has different input data types, input_names = {}, output_names = {}.".format( op.type, op.input_names, op.output_names ) @@ -125,7 +131,7 @@ def _extract_compute_dtype(op, block): if _is_floating_point(compute_dtype) and _is_floating_point( var_dtype ): - print( + _logger.warning( "Operator < {} > has different input / output data types, input_names = {}, output_names = {}.".format( op.type, op.input_names, op.output_names ) @@ -145,6 +151,15 @@ def _merge_op_stats(op_stats_list): def _get_op_stats_list(program): + def _is_special_ops_with_input_x(op_type): + # operators have input X and have inputs different dtypes. + special_op_list = ['cast', 'batch_norm', 'instance_norm', 'layer_norm'] + if op_type in special_op_list: + return True + if op_type.replace("_grad", "") in special_op_list: + return True + return False + op_stats_list = [] for block in program.blocks: block_op_stats_dict = {} @@ -161,13 +176,7 @@ def _get_op_stats_list(program): 'create_double_buffer_reader', ]: compute_dtype = None - elif op.type in [ - 'cast', - 'layer_norm', - 'layer_norm_grad', - 'batch_norm', - 'batch_norm_grad', - ]: + elif _is_special_ops_with_input_x(op.type): # Not check the input and output dtype difference for this operators. compute_dtype = _get_var_dtype_from_block(block, op, 'X', True) elif "Param" in op.input_names: @@ -183,6 +192,78 @@ def _get_op_stats_list(program): def collect_operator_stats(program=None, print_subblocks=False): + """ + Collect the number of operators for different data types through parsing + the program. The statistical data are categorized according to four data + types, namely float32, float16, bfloat16 and others. + + Args: + program(Program, optional): The program to parse. Default None, and the default main_program will be parsed. + print_subblocks(bool, optional): Whether to print the operator stats for each subblock. Default False. + + Examples: + + .. 
code-block:: python + + import paddle + + paddle.enable_static() + + class SimpleConvNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.conv = paddle.nn.Conv2D(in_channels=1, out_channels=6, kernel_size=3) + self.linear = paddle.nn.Linear(in_features=26, out_features=10) + + def forward(self, x): + out = self.conv(x) + out = paddle.nn.functional.relu(out) + out = self.linear(out) + out = paddle.nn.functional.softmax(out) + return out + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main_program, startup_program): + model = SimpleConvNet() + x = paddle.static.data( + name='input', shape=[None, 1, 28, 28], dtype='float32' + ) + out = model(x) + loss = paddle.mean(out) + optimizer = paddle.optimizer.AdamW() + optimizer = paddle.static.amp.decorate(optimizer) + optimizer.minimize(loss) + paddle.static.amp.debugging.collect_operator_stats(main_program) + # <------------------------------------------------ op list of all blocks -------------------------------------------------> + # <------------------------------------------------------- op list --------------------------------------------------------> + # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> + # adamw | 0 | 0 | 4 | 0 + # cast | 5 | 0 | 6 | 0 + # check_finite_and_unscale | 0 | 0 | 1 | 0 + # conv2d | 1 | 0 | 0 | 0 + # conv2d_grad | 1 | 0 | 0 | 0 + # elementwise_add | 2 | 0 | 0 | 0 + # elementwise_add_grad | 2 | 0 | 0 | 0 + # elementwise_mul | 0 | 0 | 1 | 0 + # elementwise_mul_grad | 0 | 0 | 1 | 0 + # fill_constant | 0 | 0 | 1 | 0 + # matmul_v2 | 1 | 0 | 0 | 0 + # matmul_v2_grad | 1 | 0 | 0 | 0 + # memcpy | 0 | 0 | 0 | 1 + # reduce_mean | 0 | 0 | 1 | 0 + # reduce_mean_grad | 0 | 0 | 1 | 0 + # relu | 1 | 0 | 0 | 0 + # relu_grad | 1 | 0 | 0 | 0 + # reshape2 | 0 | 0 | 1 | 0 + # reshape2_grad | 0 | 0 | 1 | 0 + # softmax | 0 | 0 | 1 | 0 + # softmax_grad | 0 | 0 | 1 | 0 + # update_loss_scaling | 0 | 0 | 1 | 0 + # <----------------------------------------------------- op count: 22 -----------------------------------------------------> + """ + def _convert_to_list(op_stats_unit_dict): for key, value in op_stats_unit_dict.items(): op_stats_unit_dict[key] = value.convert_to_list() diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index 1ef13e5bcd0c5..fc0aaac92bf46 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -34,6 +34,21 @@ from .function_overload import FunctionType, overload +def _set_multi_precision(optimizer, multi_precision): + if not isinstance( + optimizer, + (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), + ): + raise RuntimeError( + "Current AMP training level is O2, optimizer is expected to be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".format( + type(optimizer) + ) + ) + + if multi_precision and hasattr(optimizer, "_multi_precision"): + optimizer._multi_precision = multi_precision + + class OptimizerWithMixedPrecision: """ Optimizer with mixed-precision (MP) training. 
This is a wrapper of a common @@ -767,22 +782,96 @@ def decorate( amp_lists=None, level='O1', dtype='float16', + master_weight=None, init_loss_scaling=2**15, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.8, - use_dynamic_loss_scaling=True, + use_dynamic_loss_scaling=None, use_amp_guard=False, use_promote=False, ): """ Decorate the given optimizer to adapt to the mixed-precision training. - """ - amp_dtype = check_amp_dtype(dtype) - if amp_lists is None: - amp_lists = AutoMixedPrecisionLists(dtype=amp_dtype) + Args: + optimizer(Optimizer): A common Optimizer. + amp_lists(CustomOpLists, optional): A CustomOpLists object. The default + white_list and black_list will be used for AMP training when it is + not set. Default is None. + level(str, optional): Auto mixed precision level. Accepted values are + "O1" and "O2": O1 represents mixed precision, where the input data type of + each operator will be cast according to white_list and black_list; + O2 represents pure FP16 / BF16 training, where all operator parameters + and input data will be cast to FP16 / BF16, except for operators in + black_list, operators that don't support FP16 / BF16 kernels, and batch_norm. Default is O1. + dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. + master_weight(bool, optional): For level='O2', whether to use multi-precision + during weight updating. If master_weight is None, the optimizer + will use multi-precision in O2 level. Default is None. + init_loss_scaling(float, optional): The initial loss scaling factor. + Default is 32768. + incr_every_n_steps(int, optional): Increases loss scaling every n + consecutive steps with finite gradients. Default is 1000. + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n + accumulated steps with nan or inf gradients. Default is 2. + incr_ratio(float, optional): The multiplier to use when increasing the + loss scaling. Default is 2. + decr_ratio(float, optional): The less-than-one multiplier to use when + decreasing the loss scaling. Default is 0.8. + use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss + scaling. Default is None, which means True for float16, and False + for bfloat16. + + Returns: + An optimizer acting like a normal one but with mixed-precision training enabled. + + Examples: + + ..
code-block:: python + + import paddle + + paddle.enable_static() + + class SimpleConvNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.conv = paddle.nn.Conv2D(in_channels=1, out_channels=6, kernel_size=3) + self.linear = paddle.nn.Linear(in_features=26, out_features=10) + + def forward(self, x): + out = self.conv(x) + out = paddle.nn.functional.relu(out) + out = self.linear(out) + out = paddle.nn.functional.softmax(out) + return out + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main_program, startup_program): + model = SimpleConvNet() + x = paddle.static.data( + name='input', shape=[None, 1, 28, 28], dtype='float32' + ) + out = model(x) + loss = paddle.mean(out) + optimizer = paddle.optimizer.AdamW() + optimizer = paddle.static.amp.decorate(optimizer, level="O2", dtype="float16") + optimizer.minimize(loss) + + if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup_program) + + # Call `amp_init` after FP32 parameters initialization, such as `exe.run(startup_program)`, + # to convert FP32 parameters to low precision FP16 / BF16. + optimizer.amp_init(place, scope=paddle.static.global_scope()) + + """ # check amp_level: O0-O2 level = level.upper() if not (level in ['O0', 'O1', 'O2']): @@ -790,6 +879,18 @@ def decorate( "level should be O0, O1 or O2. O0 represents fp32 train mode, O1 represents AMP train mode, O2 represents pure fp16/bf16 train mode." ) + amp_dtype = check_amp_dtype(dtype) + if amp_lists is None: + amp_lists = AutoMixedPrecisionLists(dtype=amp_dtype) + + if use_dynamic_loss_scaling is None: + use_dynamic_loss_scaling = dtype == "float16" + + if optimizer is not None: + # support master_weight + multi_precision = not (master_weight is False) + _set_multi_precision(optimizer, multi_precision) + mp_optimizer = OptimizerWithMixedPrecision( optimizer, amp_lists, diff --git a/test/amp/amp_base_models.py b/test/amp/amp_base_models.py index 23c4b018b67d5..8b63b2391c020 100644 --- a/test/amp/amp_base_models.py +++ b/test/amp/amp_base_models.py @@ -42,7 +42,6 @@ def _build_optimizer( beta2=0.836, epsilon=1e-4, weight_decay=0.01, - multi_precision=True, ) if use_amp: optimizer = paddle.static.amp.decorate( diff --git a/test/amp/test_model_cast_to_bf16.py b/test/amp/test_model_cast_to_bf16.py index 1a58a2905ec66..3002b623b18af 100644 --- a/test/amp/test_model_cast_to_bf16.py +++ b/test/amp/test_model_cast_to_bf16.py @@ -221,11 +221,29 @@ def test_graph_cast(self): class TestProgramBF16(AmpTestBase): + def _check_optimizer(self, program, expected_num_mp): + optimizers = [] + for block in program.blocks: + for op in block.ops: + if "Param" in op.input_names and "Grad" in op.input_names: + optimizers.append(op) + + actual_num_mp = 0 + for op in optimizers: + if op.has_attr("multi_precision") and op.attr("multi_precision"): + actual_num_mp += 1 + self.assertEqual( + actual_num_mp, + expected_num_mp, + f"The number of optimizers with multi_precison = True is expected to be {expected_num_mp}, but recieved {actual_num_mp}.", + ) + def test_amp_bf16_o1(self): main_program, startup_program = build_embedding_model( True, "bfloat16", "O1" ) self.assertEqual(main_program.num_blocks, 1) + self._check_optimizer(main_program, 0) amp.debugging.collect_operator_stats(main_program) op_stats_list = amp.debugging._get_op_stats_list(main_program) @@ -255,6 +273,11 @@ 
def test_amp_bf16_o2(self): "squared_l2_norm": 2, "adamw": 2, } + self._check_optimizer( + main_program, + expected_bf16_calls["matmul_v2"] + + expected_bf16_calls["elementwise_add"], + ) self._check_op_calls(op_stats_list[0], expected_bf16_calls) From 83c2e68207c7689684fd46453d0f3e8dacd4e7cc Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Mon, 24 Apr 2023 15:49:46 +0800 Subject: [PATCH 013/405] Move fused feedforward xpu (#53196) * add sig file * trans fused feedforward compute function to phi * remove fluid include * delete old register info * fix build error * trans fused feedforward grad xpu to phi --- .../fused/fused_feedforward_op_xpu.cc | 832 ------------------ .../kernels/fused_feedforward_grad_kernel.h | 69 ++ paddle/phi/kernels/fused_feedforward_kernel.h | 61 ++ .../xpu/fused_feedforward_grad_xpu_kernel.cc | 542 ++++++++++++ .../xpu/fused_feedforward_xpu_kernel.cc | 390 ++++++++ .../phi/ops/compat/fused_feedforward_sig.cc | 102 +++ 6 files changed, 1164 insertions(+), 832 deletions(-) delete mode 100644 paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc create mode 100644 paddle/phi/kernels/fused_feedforward_grad_kernel.h create mode 100644 paddle/phi/kernels/fused_feedforward_kernel.h create mode 100644 paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc create mode 100644 paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc create mode 100644 paddle/phi/ops/compat/fused_feedforward_sig.cc diff --git a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc deleted file mode 100644 index 4b9ba95143345..0000000000000 --- a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc +++ /dev/null @@ -1,832 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/matmul_v2_op.h" -#include "paddle/fluid/operators/xpu_api_wrapper.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -#include "paddle/fluid/operators/fused/xpu_fused_common_function.h" - -namespace paddle { -namespace operators { - -template -class FusedFeedForwardXPUKernel : public framework::OpKernel { - using XPUTypeT = typename XPUTypeTrait::Type; - - public: - void FFN(const phi::XPUContext& dev_ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* linear1_weight, - const phi::DenseTensor* linear1_bias, - const phi::DenseTensor* linear2_weight, - const phi::DenseTensor* linear2_bias, - const phi::DenseTensor* ln_scale, - const phi::DenseTensor* ln_bias, - phi::DenseTensor* out, - phi::DenseTensor* dropout1_mask, - phi::DenseTensor* dropout2_mask, - phi::DenseTensor* ln_mean, - phi::DenseTensor* ln_variance, - phi::DenseTensor* linear1_out, - phi::DenseTensor* ln1_out, - phi::DenseTensor* dropout1_out, - phi::DenseTensor* dropout2_out, - const int bsz_seq, - const int d_model, - const int dim_feedforward, - const std::string& act_method, - const bool pre_layer_norm, - const float epsilon1, - const float epsilon2, - const XPUDropoutParam& dropout_param1, - const XPUDropoutParam& dropout_param2, - int ring_id) const { - xpu::Context* xpu_ctx = dev_ctx.x_context(); - xpu::ctx_guard RAII_GUARD(xpu_ctx); - - int r = xpu::SUCCESS; - - const XPUTypeT* x_ptr = reinterpret_cast(x->data()); - const XPUTypeT* residual_ptr = x_ptr; - const XPUTypeT* linear1_weight_ptr = - reinterpret_cast(linear1_weight->data()); - const XPUTypeT* linear1_bias_ptr = - reinterpret_cast(linear1_bias->data()); - const XPUTypeT* linear2_weight_ptr = - reinterpret_cast(linear2_weight->data()); - const XPUTypeT* linear2_bias_ptr = - reinterpret_cast(linear2_bias->data()); - - const float* ln_scale_ptr = ln_scale->data(); - - const float* ln_bias_ptr = ln_bias->data(); - - // out - XPUTypeT* out_ptr = reinterpret_cast(out->data()); - XPUTypeT* linear1_out_ptr = - reinterpret_cast(linear1_out->data()); - XPUTypeT* dropout1_mask_ptr = - reinterpret_cast(dropout1_mask->data()); - XPUTypeT* dropout2_mask_ptr = - reinterpret_cast(dropout2_mask->data()); - float* ln_mean_ptr = ln_mean->data(); - float* ln_variance_ptr = ln_variance->data(); - - XPUTypeT* dropout1_out_ptr = - reinterpret_cast(dropout1_out->data()); - XPUTypeT* dropout2_out_ptr = - reinterpret_cast(dropout2_out->data()); - - size_t l3_total_size = xpu_ctx->_l3_mgr.get_size(); - XPUTypeT* linear2_before_tmp_ptr = NULL; // dim_feedforward * bsz_seq - XPUTypeT* linear2_after_tmp_ptr = NULL; // d_model * bsz_seq - if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) { - XPUTypeT* l3_ptr = - RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr); - linear2_before_tmp_ptr = linear2_after_tmp_ptr = l3_ptr; - } else if ((l3_total_size < dim_feedforward * bsz_seq * sizeof(T)) && - (l3_total_size >= d_model * bsz_seq * sizeof(T))) { - XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr); - linear2_after_tmp_ptr = l3_ptr; - linear2_before_tmp_ptr = - RAII_GUARD.alloc(dim_feedforward * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(linear2_before_tmp_ptr); - - } else { - XPUTypeT* gm_ptr = RAII_GUARD.alloc(dim_feedforward * 
bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(gm_ptr); - linear2_before_tmp_ptr = linear2_after_tmp_ptr = gm_ptr; - } - - // layernorm - if (pre_layer_norm) { - XPUTypeT* ln1_out_ptr = reinterpret_cast(ln1_out->data()); - r = xpu::layer_norm(xpu_ctx, - x_ptr, - ln1_out_ptr, - bsz_seq, - d_model, - epsilon1, - ln_scale_ptr, - ln_bias_ptr, - ln_mean_ptr, - ln_variance_ptr); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm "); - x_ptr = ln1_out_ptr; - } - - // fc - phi::XpuFcInfo linear1_fc_info; - linear1_fc_info.InitFcInfo(0, - bsz_seq, - dim_feedforward, - d_model, - false, - false, - nullptr, - nullptr, - nullptr); - phi::MatMulXPUFunction(xpu_ctx, - x_ptr, - linear1_weight_ptr, - linear2_before_tmp_ptr, - linear1_fc_info, - 1.0f); - - // bias - r = xpu::broadcast_add(xpu_ctx, - linear2_before_tmp_ptr, - linear1_bias_ptr, - linear1_out_ptr, - {bsz_seq, dim_feedforward}, - {dim_feedforward}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); - - // act - if (act_method == "gelu") { - r = xpu::gelu(xpu_ctx, - linear1_out_ptr, - linear2_before_tmp_ptr, - linear1_out->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu"); - } else if (act_method == "relu") { - r = xpu::relu(xpu_ctx, - linear1_out_ptr, - linear2_before_tmp_ptr, - linear1_out->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Currently only supports gelu or relu activation functions!")); - } - - // dropout1 - Dropout(xpu_ctx, - linear2_before_tmp_ptr, - dropout1_mask_ptr, - dropout1_out_ptr, - dropout_param1, - dropout1_out->numel()); - - // fc - phi::XpuFcInfo linear2_fc_info; - linear2_fc_info.InitFcInfo(0, - bsz_seq, - d_model, - dim_feedforward, - false, - false, - nullptr, - nullptr, - nullptr); - phi::MatMulXPUFunction(xpu_ctx, - dropout1_out_ptr, - linear2_weight_ptr, - dropout2_out_ptr, - linear2_fc_info, - 1.0f); - - // bias - r = xpu::broadcast_add(xpu_ctx, - dropout2_out_ptr, - linear2_bias_ptr, - dropout2_out_ptr, - {bsz_seq, d_model}, - {d_model}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); - - // dropout2 - Dropout(xpu_ctx, - dropout2_out_ptr, - dropout2_mask_ptr, - dropout2_out_ptr, - dropout_param2, - dropout2_out->numel()); - - // residual_ptr + dropout_out - XPUTypeT* residual_add_out_ptr = out_ptr; - if (pre_layer_norm == false) { - residual_add_out_ptr = dropout2_out_ptr; - } - r = xpu::broadcast_add(xpu_ctx, - residual_ptr, - dropout2_out_ptr, - residual_add_out_ptr, - {bsz_seq, d_model}, - {bsz_seq, d_model}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); - - if (pre_layer_norm == false) { - r = xpu::layer_norm(xpu_ctx, - residual_add_out_ptr, - out_ptr, - bsz_seq, - d_model, - epsilon2, - ln_scale_ptr, - ln_bias_ptr, - ln_mean_ptr, - ln_variance_ptr); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); - } - } - - void Compute(const framework::ExecutionContext& context) const override { - auto place = context.GetPlace(); - - auto* x = context.Input("X"); - - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto* linear2_weight = context.Input("Linear2Weight"); - auto* linear2_bias = context.Input("Linear2Bias"); - const bool pre_layer_norm = context.Attr("pre_layer_norm"); - - const phi::DenseTensor* ln_scale = nullptr; - const phi::DenseTensor* ln_bias = nullptr; - phi::DenseTensor* ln_mean = nullptr; - phi::DenseTensor* ln_variance = nullptr; - phi::DenseTensor* ln1_out = nullptr; - - if (pre_layer_norm) { - ln_scale = context.Input("Ln1Scale"); - ln_bias = context.Input("Ln1Bias"); 
- ln_mean = context.Output("Ln1Mean"); - ln_variance = context.Output("Ln1Variance"); - ln1_out = context.Output("Ln1Out"); - ln1_out->mutable_data(place); - } else { - ln_scale = context.Input("Ln2Scale"); - ln_bias = context.Input("Ln2Bias"); - ln_mean = context.Output("Ln2Mean"); - ln_variance = context.Output("Ln2Variance"); - } - - auto* out = context.Output("Out"); - auto* dropout1_mask = context.Output("Dropout1Mask"); - auto* dropout2_mask = context.Output("Dropout2Mask"); - auto* linear1_out = context.Output("Linear1Out"); - - auto* dropout1_out = context.Output("Dropout1Out"); - auto* dropout2_out = context.Output("Dropout2Out"); - - const std::string act_method = context.Attr("act_method"); - - const int ring_id = context.Attr("ring_id"); - const float epsilon1 = context.Attr("ln1_epsilon"); - const float epsilon2 = context.Attr("ln2_epsilon"); - XPUDropoutParam dropout_param1; - dropout_param1.initXPUDropoutParam(context, 1); - XPUDropoutParam dropout_param2; - dropout_param2.initXPUDropoutParam(context, 2); - - ln_mean->mutable_data(place); - ln_variance->mutable_data(place); - out->mutable_data(place); - dropout1_mask->mutable_data(place); - dropout2_mask->mutable_data(place); - dropout1_out->mutable_data(place); - dropout2_out->mutable_data(place); - linear1_out->mutable_data(place); - - auto x_dim = x->dims(); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dim), 0, false); - - auto dim = linear1_weight->dims(); - int d_model = dim[0]; - int dim_feedforward = dim[dim.size() - 1]; - int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; - - auto& dev_ctx = context.template device_context(); - FFN(dev_ctx, - x, - linear1_weight, - linear1_bias, - linear2_weight, - linear2_bias, - ln_scale, - ln_bias, - out, - dropout1_mask, - dropout2_mask, - ln_mean, - ln_variance, - linear1_out, - ln1_out, - dropout1_out, - dropout2_out, - bsz_seq, - d_model, - dim_feedforward, - act_method, - pre_layer_norm, - epsilon1, - epsilon2, - dropout_param1, - dropout_param2, - ring_id); - } -}; - -template -class FusedFeedForwardGradXPUKernel : public framework::OpKernel { - using XPUTypeT = typename XPUTypeTrait::Type; - - public: - void FFNGrad(const phi::XPUContext& dev_ctx, - const phi::DenseTensor* d_out, - const phi::DenseTensor* x, - const phi::DenseTensor* dropout1_mask, - const phi::DenseTensor* dropout2_mask, - const phi::DenseTensor* linear1_out, - const phi::DenseTensor* ln1_out, - const phi::DenseTensor* dropout1_out, - const phi::DenseTensor* dropout2_out, - const phi::DenseTensor* linear1_weight, - const phi::DenseTensor* linear2_weight, - const phi::DenseTensor* ln_scale, - const phi::DenseTensor* ln_mean, - const phi::DenseTensor* ln_variance, - phi::DenseTensor* d_x, - phi::DenseTensor* d_linear1_weight, - phi::DenseTensor* d_linear1_bias, - phi::DenseTensor* d_linear2_weight, - phi::DenseTensor* d_linear2_bias, - phi::DenseTensor* d_ln_scale, - phi::DenseTensor* d_ln_bias, - const int bsz_seq, - const int d_model, - const int dim_feedforward, - const XPUDropoutParam& dropout_param1, - const XPUDropoutParam& dropout_param2, - const std::string& act_method, - const bool pre_layer_norm, - const float epsilon, - const int ring_id) const { - xpu::Context* xpu_ctx = dev_ctx.x_context(); - xpu::ctx_guard RAII_GUARD(xpu_ctx); - int r = xpu::SUCCESS; - - // inputs ptr - const XPUTypeT* d_out_ptr = - reinterpret_cast(d_out->data()); - const XPUTypeT* x_ptr = reinterpret_cast(x->data()); - const XPUTypeT* dropout1_mask_ptr = - 
reinterpret_cast(dropout1_mask->data()); - const XPUTypeT* dropout2_mask_ptr = - reinterpret_cast(dropout2_mask->data()); - const XPUTypeT* linear1_out_ptr = - reinterpret_cast(linear1_out->data()); - const XPUTypeT* dropout1_out_ptr = - reinterpret_cast(dropout1_out->data()); - const XPUTypeT* linear1_weight_ptr = - reinterpret_cast(linear1_weight->data()); - const XPUTypeT* linear2_weight_ptr = - reinterpret_cast(linear2_weight->data()); - const float* ln_scale_ptr = ln_scale->data(); - - const float* ln_mean_ptr = ln_mean->data(); - const float* ln_variance_ptr = ln_variance->data(); - // outputs ptr - XPUTypeT* d_x_ptr = reinterpret_cast(d_x->data()); - XPUTypeT* d_linear1_weight_ptr = - reinterpret_cast(d_linear1_weight->data()); - XPUTypeT* d_linear1_bias_ptr = - reinterpret_cast(d_linear1_bias->data()); - XPUTypeT* d_linear2_weight_ptr = - reinterpret_cast(d_linear2_weight->data()); - XPUTypeT* d_linear2_bias_ptr = - reinterpret_cast(d_linear2_bias->data()); - float* d_ln_scale_ptr = d_ln_scale->data(); - float* d_ln_bias_ptr = d_ln_bias->data(); - - size_t l3_total_size = xpu_ctx->_l3_mgr.get_size(); - - XPUTypeT* big_tmp_l3_ptr = NULL; // dim_feedforward * bsz_seq - XPUTypeT* small_tmp_l3_ptr = NULL; // d_model * bsz_seq - XPUTypeT* big_tmp_gm_ptr = NULL; // dim_feedforward * bsz_seq - XPUTypeT* small_tmp_gm_ptr = NULL; // d_model * bsz_seq - - XPUTypeT* d_layernorm_out_ptr = NULL; // dx9 - XPUTypeT* d_dropout2_out_ptr = NULL; // dx7 - - XPUTypeT* d_linear2_out_ptr = NULL; // dx5 - XPUTypeT* d_dropout1_out_ptr = NULL; // dx4 - XPUTypeT* d_act_out_ptr = NULL; // dx3 - - XPUTypeT* d_linear1_out_ptr = NULL; // dx1 - - const XPUTypeT* d_residual_ptr = d_out_ptr; - - if (l3_total_size >= (dim_feedforward * bsz_seq * sizeof(T) + - d_model * bsz_seq * sizeof(T))) { - big_tmp_l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr); - small_tmp_l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr); - d_layernorm_out_ptr = small_tmp_l3_ptr; - d_dropout2_out_ptr = small_tmp_l3_ptr; - d_linear2_out_ptr = big_tmp_l3_ptr; - d_dropout1_out_ptr = big_tmp_l3_ptr; - d_act_out_ptr = big_tmp_l3_ptr; - d_linear1_out_ptr = small_tmp_l3_ptr; - } else if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) { - big_tmp_l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr); - small_tmp_l3_ptr = big_tmp_l3_ptr; - big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); - small_tmp_gm_ptr = RAII_GUARD.alloc(d_model * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr); - - d_layernorm_out_ptr = small_tmp_l3_ptr; - d_dropout2_out_ptr = small_tmp_gm_ptr; - d_linear2_out_ptr = big_tmp_l3_ptr; - d_dropout1_out_ptr = big_tmp_l3_ptr; - d_act_out_ptr = big_tmp_gm_ptr; - d_linear1_out_ptr = small_tmp_l3_ptr; - - } else if (l3_total_size >= d_model * bsz_seq * sizeof(T)) { - big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); - small_tmp_l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr); - - d_layernorm_out_ptr = small_tmp_l3_ptr; - d_dropout2_out_ptr = small_tmp_l3_ptr; - d_linear2_out_ptr = big_tmp_gm_ptr; - d_dropout1_out_ptr = big_tmp_gm_ptr; - d_act_out_ptr = big_tmp_gm_ptr; - d_linear1_out_ptr = small_tmp_l3_ptr; - } else { - big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); - 
PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); - small_tmp_gm_ptr = RAII_GUARD.alloc(d_model * bsz_seq); - PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr); - d_layernorm_out_ptr = small_tmp_gm_ptr; - d_dropout2_out_ptr = small_tmp_gm_ptr; - d_linear2_out_ptr = big_tmp_gm_ptr; - d_dropout1_out_ptr = big_tmp_gm_ptr; - d_act_out_ptr = big_tmp_gm_ptr; - d_linear1_out_ptr = small_tmp_gm_ptr; - } - - if (pre_layer_norm == false) { - const XPUTypeT* dropout2_out_ptr = - reinterpret_cast(dropout2_out->data()); - r = xpu::layer_norm_grad(xpu_ctx, - dropout2_out_ptr, - d_out_ptr, - d_layernorm_out_ptr, - bsz_seq, - d_model, - epsilon, - ln_scale_ptr, - ln_mean_ptr, - ln_variance_ptr, - d_ln_scale_ptr, - d_ln_bias_ptr); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); - d_residual_ptr = d_layernorm_out_ptr; - } - DropoutGrad(xpu_ctx, - d_residual_ptr, - dropout2_mask_ptr, - d_dropout2_out_ptr, - dropout_param2, - bsz_seq * d_model); - // linear_grad2 - r = xpu::reduce_sum(xpu_ctx, - d_dropout2_out_ptr, - d_linear2_bias_ptr, - {bsz_seq, d_model}, - {0}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); - - phi::XpuFcInfo linear2_fc_info; - linear2_fc_info.InitFcInfo(0, - bsz_seq, - d_model, - dim_feedforward, - false, - false, - nullptr, - nullptr, - nullptr); - - const XPUTypeT* a_1 = reinterpret_cast(NULL); - const XPUTypeT* b_1 = reinterpret_cast(NULL); - const XPUTypeT* a_2 = reinterpret_cast(NULL); - const XPUTypeT* b_2 = reinterpret_cast(NULL); - XPUTypeT* c_1 = d_linear2_out_ptr; - XPUTypeT* c_2 = d_linear2_weight_ptr; - phi::XpuFcInfo info_d_dropout1; - phi::XpuFcInfo info_dw2; - - std::tuple - fc_info = phi::MatmulGradFcInfo(xpu_ctx, - &RAII_GUARD, - linear2_fc_info, - false, - false, - dropout1_out_ptr, - linear2_weight_ptr, - d_dropout2_out_ptr); - - std::tie(info_d_dropout1, info_dw2, a_1, b_1, a_2, b_2) = fc_info; - - // if l3_total_size >= dim_feedforward * bsz_seq * sizeof(T), first transpos - if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T) && - info_dw2.trans_x) { - r = xpu::transpose(xpu_ctx, - dropout1_out_ptr, - big_tmp_l3_ptr, - {bsz_seq, dim_feedforward}, - {1, 0}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - a_2 = big_tmp_l3_ptr; - info_dw2.trans_x = !info_dw2.trans_x; - info_dw2.stride_x = info_dw2.k; - } - - phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true); - - phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true); - - // dropout_grad1 - DropoutGrad(xpu_ctx, - d_linear2_out_ptr, - dropout1_mask_ptr, - d_dropout1_out_ptr, - dropout_param1, - bsz_seq * dim_feedforward); - - // act_grad - if (act_method == "gelu") { - r = xpu::gelu_grad(xpu_ctx, - linear1_out_ptr, - linear1_out_ptr, - d_dropout1_out_ptr, - d_act_out_ptr, - bsz_seq * dim_feedforward); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad"); - } else if (act_method == "relu") { - r = xpu::relu_grad(xpu_ctx, - linear1_out_ptr, - linear1_out_ptr, - d_dropout1_out_ptr, - d_act_out_ptr, - bsz_seq * dim_feedforward); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad"); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Currently only supports gelu or relu activation functions!")); - } - - // linear1_grad - r = xpu::reduce_sum(xpu_ctx, - d_act_out_ptr, - d_linear1_bias_ptr, - {bsz_seq, dim_feedforward}, - {0}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); - - phi::XpuFcInfo linear1_fc_info; - linear1_fc_info.InitFcInfo(0, - bsz_seq, - dim_feedforward, - d_model, - false, - false, - nullptr, - nullptr, - nullptr); - - a_1 = reinterpret_cast(NULL); - b_1 = 
reinterpret_cast(NULL); - a_2 = reinterpret_cast(NULL); - b_2 = reinterpret_cast(NULL); - - c_1 = (pre_layer_norm == true ? d_linear1_out_ptr : d_x_ptr); - c_2 = d_linear1_weight_ptr; - phi::XpuFcInfo info_dx; - phi::XpuFcInfo info_dw1; - - const XPUTypeT* linear1_x_ptr = - (pre_layer_norm == true - ? reinterpret_cast(ln1_out->data()) - : x_ptr); - - if (l3_total_size >= d_model * bsz_seq * sizeof(T) && info_dw1.trans_x) { - r = xpu::transpose( - xpu_ctx, linear1_x_ptr, small_tmp_l3_ptr, {bsz_seq, d_model}, {1, 0}); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); - a_2 = small_tmp_l3_ptr; - info_dw1.trans_x = !info_dw1.trans_x; - info_dw1.stride_x = info_dw1.k; - } - - fc_info = phi::MatmulGradFcInfo(xpu_ctx, - &RAII_GUARD, - linear1_fc_info, - false, - false, - linear1_x_ptr, - linear1_weight_ptr, - d_act_out_ptr); - - std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info; - - phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true); - - phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true); - - if (pre_layer_norm) { - r = xpu::layer_norm_grad(xpu_ctx, - x_ptr, - c_1, - c_1, - bsz_seq, - d_model, - epsilon, - ln_scale_ptr, - ln_mean_ptr, - ln_variance_ptr, - d_ln_scale_ptr, - d_ln_bias_ptr); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); - } - - r = xpu::add(xpu_ctx, c_1, d_residual_ptr, d_x_ptr, d_model * bsz_seq); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); - } - - void Compute(const framework::ExecutionContext& context) const override { - auto place = context.GetPlace(); - const bool pre_layer_norm = context.Attr("pre_layer_norm"); - // inputs - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* x = context.Input("X"); - - auto* dropout1_mask = context.Input("Dropout1Mask"); - auto* dropout2_mask = context.Input("Dropout2Mask"); - auto* linear1_out = context.Input("Linear1Out"); - auto* ln1_out = - pre_layer_norm ? 
context.Input("Ln1Out") : nullptr; - - auto* dropout1_out = context.Input("Dropout1Out"); - auto* dropout2_out = context.Input("Dropout2Out"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear2_weight = context.Input("Linear2Weight"); - - const phi::DenseTensor* ln_mean = nullptr; - const phi::DenseTensor* ln_variance = nullptr; - const phi::DenseTensor* ln_scale = nullptr; - - if (pre_layer_norm) { - ln_mean = context.Input("Ln1Mean"); - ln_variance = context.Input("Ln1Variance"); - ln_scale = context.Input("Ln1Scale"); - } else { - ln_mean = context.Input("Ln2Mean"); - ln_variance = context.Input("Ln2Variance"); - ln_scale = context.Input("Ln2Scale"); - } - - // output - auto* d_x = context.Output(framework::GradVarName("X")); - - phi::DenseTensor* d_ln_scale = nullptr; - phi::DenseTensor* d_ln_bias = nullptr; - - if (pre_layer_norm) { - d_ln_scale = - context.Output(framework::GradVarName("Ln1Scale")); - d_ln_bias = - context.Output(framework::GradVarName("Ln1Bias")); - } else { - d_ln_scale = - context.Output(framework::GradVarName("Ln2Scale")); - d_ln_bias = - context.Output(framework::GradVarName("Ln2Bias")); - } - - auto* d_linear1_weight = context.Output( - framework::GradVarName("Linear1Weight")); - auto* d_linear1_bias = - context.Output(framework::GradVarName("Linear1Bias")); - auto* d_linear2_weight = context.Output( - framework::GradVarName("Linear2Weight")); - auto* d_linear2_bias = - context.Output(framework::GradVarName("Linear2Bias")); - - float epsilon = 0.0f; - if (pre_layer_norm) { - epsilon = context.Attr("ln1_epsilon"); - } else { - epsilon = context.Attr("ln2_epsilon"); - } - - const std::string act_method = context.Attr("act_method"); - - XPUDropoutParam dropout_param1(context, 1); - XPUDropoutParam dropout_param2(context, 2); - - const int ring_id = context.Attr("ring_id"); - - d_x->mutable_data(place); - d_ln_scale->mutable_data(place); - d_ln_bias->mutable_data(place); - d_linear1_bias->mutable_data(place); - d_linear2_bias->mutable_data(place); - d_linear1_weight->mutable_data(place); - d_linear2_weight->mutable_data(place); - - auto x_dim = x->dims(); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dim), 0, false); - - auto linear1_weight_dim = linear1_weight->dims(); - int d_model = linear1_weight_dim[0]; - int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; - int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; - auto& dev_ctx = context.template device_context(); - - FFNGrad(dev_ctx, - d_out, - x, - dropout1_mask, - dropout2_mask, - linear1_out, - ln1_out, - dropout1_out, - dropout2_out, - linear1_weight, - linear2_weight, - ln_scale, - ln_mean, - ln_variance, - d_x, - d_linear1_weight, - d_linear1_bias, - d_linear2_weight, - d_linear2_bias, - d_ln_scale, - d_ln_bias, - bsz_seq, - d_model, - dim_feedforward, - dropout_param1, - dropout_param2, - act_method, - pre_layer_norm, - epsilon, - ring_id); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - fused_feedforward, - ops::FusedFeedForwardXPUKernel, - ops::FusedFeedForwardXPUKernel); - -REGISTER_OP_XPU_KERNEL( - fused_feedforward_grad, - ops::FusedFeedForwardGradXPUKernel, - ops::FusedFeedForwardGradXPUKernel); - -#endif diff --git a/paddle/phi/kernels/fused_feedforward_grad_kernel.h b/paddle/phi/kernels/fused_feedforward_grad_kernel.h new file mode 100644 index 0000000000000..9eee46a83987e --- /dev/null +++ b/paddle/phi/kernels/fused_feedforward_grad_kernel.h @@ 
-0,0 +1,69 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedFeedForwardGradKernel( + const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& linear1_weight, + const DenseTensor& linear1_bias, + const DenseTensor& linear2_weight, + const DenseTensor& dropout1_mask, + const DenseTensor& dropout2_mask, + const DenseTensor& linear1_out, + const DenseTensor& dropout1_out, + const DenseTensor& dropout2_out, + const paddle::optional& ln1_scale, + const paddle::optional& ln1_bias, + const paddle::optional& ln1_out, + const paddle::optional& ln1_mean, + const paddle::optional& ln1_variance, + const paddle::optional& ln2_scale, + const paddle::optional& ln2_bias, + const paddle::optional& ln2_mean, + const paddle::optional& ln2_variance, + const paddle::optional& linear2_bias, + bool pre_layer_norm, + float ln1_epsilon, + float ln2_epsilon, + const std::string& act_method, + float dropout1_prob, + float dropout2_prob, + const std::string& dropout1_implementation, + const std::string& dropout2_implementation, + bool is_test, + bool dropout1_fix_seed, + bool dropout2_fix_seed, + int dropout1_seed_val, + int dropout2_seed_val, + bool add_residual, + int ring_id, + DenseTensor* x_grad, + DenseTensor* ln1_scale_grad, + DenseTensor* ln1_bias_grad, + DenseTensor* ln2_scale_grad, + DenseTensor* ln2_bias_grad, + DenseTensor* linear1_weight_grad, + DenseTensor* linear1_bias_grad, + DenseTensor* linear2_weight_grad, + DenseTensor* linear2_bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/fused_feedforward_kernel.h b/paddle/phi/kernels/fused_feedforward_kernel.h new file mode 100644 index 0000000000000..cade7adc0c7c2 --- /dev/null +++ b/paddle/phi/kernels/fused_feedforward_kernel.h @@ -0,0 +1,61 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedFeedForwardKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& dropout1_seed, + const paddle::optional& dropout2_seed, + const DenseTensor& linear1_weight, + const paddle::optional& linear1_bias, + const DenseTensor& linear2_weight, + const paddle::optional& linear2_bias, + const paddle::optional& ln1_scale, + const paddle::optional& ln1_bias, + const paddle::optional& ln2_scale, + const paddle::optional& ln2_bias, + bool pre_layer_norm, + float ln1_epsilon, + float ln2_epsilon, + const std::string& act_method, + float dropout1_prob, + float dropout2_prob, + const std::string& dropout1_implementation, + const std::string& dropout2_implementation, + bool is_test, + bool dropout1_fix_seed, + bool dropout2_fix_seed, + int dropout1_seed_val, + int dropout2_seed_val, + bool add_residual, + int ring_id, + DenseTensor* out, + DenseTensor* dropout1_mask, + DenseTensor* dropout2_mask, + DenseTensor* ln1_mean, + DenseTensor* ln1_variance, + DenseTensor* ln2_mean, + DenseTensor* ln2_variance, + DenseTensor* linear1_out, + DenseTensor* ln1_out, + DenseTensor* dropout1_out, + DenseTensor* dropout2_out); + +} // namespace phi diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc new file mode 100644 index 0000000000000..cb10930dc9b3e --- /dev/null +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc @@ -0,0 +1,542 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" +#include "paddle/phi/kernels/xpu/xpu_fused_common_function.h" + +namespace phi { +namespace fusion { + +template +void FFNGrad(const phi::XPUContext& dev_ctx, + const phi::DenseTensor* d_out, + const phi::DenseTensor* x, + const phi::DenseTensor* dropout1_mask, + const phi::DenseTensor* dropout2_mask, + const phi::DenseTensor* linear1_out, + const phi::DenseTensor* ln1_out, + const phi::DenseTensor* dropout1_out, + const phi::DenseTensor* dropout2_out, + const phi::DenseTensor* linear1_weight, + const phi::DenseTensor* linear2_weight, + const phi::DenseTensor* ln_scale, + const phi::DenseTensor* ln_mean, + const phi::DenseTensor* ln_variance, + phi::DenseTensor* d_x, + phi::DenseTensor* d_linear1_weight, + phi::DenseTensor* d_linear1_bias, + phi::DenseTensor* d_linear2_weight, + phi::DenseTensor* d_linear2_bias, + phi::DenseTensor* d_ln_scale, + phi::DenseTensor* d_ln_bias, + const int bsz_seq, + const int d_model, + const int dim_feedforward, + const XPUDropoutParam& dropout_param1, + const XPUDropoutParam& dropout_param2, + const std::string& act_method, + const bool pre_layer_norm, + const float epsilon, + const int ring_id) { + using XPUTypeT = typename XPUTypeTrait::Type; + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + int r = xpu::SUCCESS; + + // inputs ptr + const XPUTypeT* d_out_ptr = + reinterpret_cast(d_out->data()); + const XPUTypeT* x_ptr = reinterpret_cast(x->data()); + const XPUTypeT* dropout1_mask_ptr = + reinterpret_cast(dropout1_mask->data()); + const XPUTypeT* dropout2_mask_ptr = + reinterpret_cast(dropout2_mask->data()); + const XPUTypeT* linear1_out_ptr = + reinterpret_cast(linear1_out->data()); + const XPUTypeT* dropout1_out_ptr = + reinterpret_cast(dropout1_out->data()); + const XPUTypeT* linear1_weight_ptr = + reinterpret_cast(linear1_weight->data()); + const XPUTypeT* linear2_weight_ptr = + reinterpret_cast(linear2_weight->data()); + const float* ln_scale_ptr = ln_scale->data(); + + const float* ln_mean_ptr = ln_mean->data(); + const float* ln_variance_ptr = ln_variance->data(); + // outputs ptr + XPUTypeT* d_x_ptr = reinterpret_cast(d_x->data()); + XPUTypeT* d_linear1_weight_ptr = + reinterpret_cast(d_linear1_weight->data()); + XPUTypeT* d_linear1_bias_ptr = + reinterpret_cast(d_linear1_bias->data()); + XPUTypeT* d_linear2_weight_ptr = + reinterpret_cast(d_linear2_weight->data()); + XPUTypeT* d_linear2_bias_ptr = + reinterpret_cast(d_linear2_bias->data()); + float* d_ln_scale_ptr = d_ln_scale->data(); + float* d_ln_bias_ptr = d_ln_bias->data(); + + size_t l3_total_size = xpu_ctx->_l3_mgr.get_size(); + + XPUTypeT* big_tmp_l3_ptr = NULL; // dim_feedforward * bsz_seq + XPUTypeT* small_tmp_l3_ptr = NULL; // d_model * bsz_seq + XPUTypeT* big_tmp_gm_ptr = NULL; // dim_feedforward * bsz_seq + XPUTypeT* small_tmp_gm_ptr = NULL; // d_model * bsz_seq + + XPUTypeT* d_layernorm_out_ptr = NULL; // dx9 + XPUTypeT* d_dropout2_out_ptr = NULL; // dx7 + + XPUTypeT* d_linear2_out_ptr = NULL; // dx5 + XPUTypeT* d_dropout1_out_ptr = NULL; // dx4 + XPUTypeT* d_act_out_ptr = NULL; // dx3 + + XPUTypeT* d_linear1_out_ptr = NULL; // dx1 + + const XPUTypeT* 
d_residual_ptr = d_out_ptr; + + if (l3_total_size >= + (dim_feedforward * bsz_seq * sizeof(T) + d_model * bsz_seq * sizeof(T))) { + big_tmp_l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr); + small_tmp_l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr); + d_layernorm_out_ptr = small_tmp_l3_ptr; + d_dropout2_out_ptr = small_tmp_l3_ptr; + d_linear2_out_ptr = big_tmp_l3_ptr; + d_dropout1_out_ptr = big_tmp_l3_ptr; + d_act_out_ptr = big_tmp_l3_ptr; + d_linear1_out_ptr = small_tmp_l3_ptr; + } else if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) { + big_tmp_l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr); + small_tmp_l3_ptr = big_tmp_l3_ptr; + big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); + small_tmp_gm_ptr = RAII_GUARD.alloc(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr); + + d_layernorm_out_ptr = small_tmp_l3_ptr; + d_dropout2_out_ptr = small_tmp_gm_ptr; + d_linear2_out_ptr = big_tmp_l3_ptr; + d_dropout1_out_ptr = big_tmp_l3_ptr; + d_act_out_ptr = big_tmp_gm_ptr; + d_linear1_out_ptr = small_tmp_l3_ptr; + + } else if (l3_total_size >= d_model * bsz_seq * sizeof(T)) { + big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); + small_tmp_l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr); + + d_layernorm_out_ptr = small_tmp_l3_ptr; + d_dropout2_out_ptr = small_tmp_l3_ptr; + d_linear2_out_ptr = big_tmp_gm_ptr; + d_dropout1_out_ptr = big_tmp_gm_ptr; + d_act_out_ptr = big_tmp_gm_ptr; + d_linear1_out_ptr = small_tmp_l3_ptr; + } else { + big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); + small_tmp_gm_ptr = RAII_GUARD.alloc(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr); + d_layernorm_out_ptr = small_tmp_gm_ptr; + d_dropout2_out_ptr = small_tmp_gm_ptr; + d_linear2_out_ptr = big_tmp_gm_ptr; + d_dropout1_out_ptr = big_tmp_gm_ptr; + d_act_out_ptr = big_tmp_gm_ptr; + d_linear1_out_ptr = small_tmp_gm_ptr; + } + + if (pre_layer_norm == false) { + const XPUTypeT* dropout2_out_ptr = + reinterpret_cast(dropout2_out->data()); + r = xpu::layer_norm_grad(xpu_ctx, + dropout2_out_ptr, + d_out_ptr, + d_layernorm_out_ptr, + bsz_seq, + d_model, + epsilon, + ln_scale_ptr, + ln_mean_ptr, + ln_variance_ptr, + d_ln_scale_ptr, + d_ln_bias_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); + d_residual_ptr = d_layernorm_out_ptr; + } + phi::DropoutGrad(xpu_ctx, + d_residual_ptr, + dropout2_mask_ptr, + d_dropout2_out_ptr, + dropout_param2, + bsz_seq * d_model); + // linear_grad2 + r = xpu::reduce_sum( + xpu_ctx, d_dropout2_out_ptr, d_linear2_bias_ptr, {bsz_seq, d_model}, {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + phi::XpuFcInfo linear2_fc_info; + linear2_fc_info.InitFcInfo(0, + bsz_seq, + d_model, + dim_feedforward, + false, + false, + nullptr, + nullptr, + nullptr); + + const XPUTypeT* a_1 = reinterpret_cast(NULL); + const XPUTypeT* b_1 = reinterpret_cast(NULL); + const XPUTypeT* a_2 = reinterpret_cast(NULL); + const XPUTypeT* b_2 = reinterpret_cast(NULL); + XPUTypeT* c_1 = d_linear2_out_ptr; + XPUTypeT* c_2 = d_linear2_weight_ptr; + phi::XpuFcInfo info_d_dropout1; + phi::XpuFcInfo info_dw2; + + std::tuple + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + 
linear2_fc_info, + false, + false, + dropout1_out_ptr, + linear2_weight_ptr, + d_dropout2_out_ptr); + + std::tie(info_d_dropout1, info_dw2, a_1, b_1, a_2, b_2) = fc_info; + + // if l3_total_size >= dim_feedforward * bsz_seq * sizeof(T), first transpos + if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T) && + info_dw2.trans_x) { + r = xpu::transpose(xpu_ctx, + dropout1_out_ptr, + big_tmp_l3_ptr, + {bsz_seq, dim_feedforward}, + {1, 0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + a_2 = big_tmp_l3_ptr; + info_dw2.trans_x = !info_dw2.trans_x; + info_dw2.stride_x = info_dw2.k; + } + + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true); + + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true); + + // dropout_grad1 + DropoutGrad(xpu_ctx, + d_linear2_out_ptr, + dropout1_mask_ptr, + d_dropout1_out_ptr, + dropout_param1, + bsz_seq * dim_feedforward); + + // act_grad + if (act_method == "gelu") { + r = xpu::gelu_grad(xpu_ctx, + linear1_out_ptr, + linear1_out_ptr, + d_dropout1_out_ptr, + d_act_out_ptr, + bsz_seq * dim_feedforward); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad"); + } else if (act_method == "relu") { + r = xpu::relu_grad(xpu_ctx, + linear1_out_ptr, + linear1_out_ptr, + d_dropout1_out_ptr, + d_act_out_ptr, + bsz_seq * dim_feedforward); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad"); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Currently only supports gelu or relu activation functions!")); + } + + // linear1_grad + r = xpu::reduce_sum(xpu_ctx, + d_act_out_ptr, + d_linear1_bias_ptr, + {bsz_seq, dim_feedforward}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + phi::XpuFcInfo linear1_fc_info; + linear1_fc_info.InitFcInfo(0, + bsz_seq, + dim_feedforward, + d_model, + false, + false, + nullptr, + nullptr, + nullptr); + + a_1 = reinterpret_cast(NULL); + b_1 = reinterpret_cast(NULL); + a_2 = reinterpret_cast(NULL); + b_2 = reinterpret_cast(NULL); + + c_1 = (pre_layer_norm == true ? d_linear1_out_ptr : d_x_ptr); + c_2 = d_linear1_weight_ptr; + phi::XpuFcInfo info_dx; + phi::XpuFcInfo info_dw1; + + const XPUTypeT* linear1_x_ptr = + (pre_layer_norm == true + ? 
reinterpret_cast(ln1_out->data()) + : x_ptr); + + if (l3_total_size >= d_model * bsz_seq * sizeof(T) && info_dw1.trans_x) { + r = xpu::transpose( + xpu_ctx, linear1_x_ptr, small_tmp_l3_ptr, {bsz_seq, d_model}, {1, 0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + a_2 = small_tmp_l3_ptr; + info_dw1.trans_x = !info_dw1.trans_x; + info_dw1.stride_x = info_dw1.k; + } + + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + linear1_fc_info, + false, + false, + linear1_x_ptr, + linear1_weight_ptr, + d_act_out_ptr); + + std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info; + + phi::MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true); + + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true); + + if (pre_layer_norm) { + r = xpu::layer_norm_grad(xpu_ctx, + x_ptr, + c_1, + c_1, + bsz_seq, + d_model, + epsilon, + ln_scale_ptr, + ln_mean_ptr, + ln_variance_ptr, + d_ln_scale_ptr, + d_ln_bias_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); + } + + r = xpu::add(xpu_ctx, c_1, d_residual_ptr, d_x_ptr, d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); +} + +template +void FusedFeedForwardGradKernel( + const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& linear1_weight, + const DenseTensor& linear1_bias, + const DenseTensor& linear2_weight, + const DenseTensor& dropout1_mask, + const DenseTensor& dropout2_mask, + const DenseTensor& linear1_out, + const DenseTensor& dropout1_out, + const DenseTensor& dropout2_out, + const paddle::optional& ln1_scale, + const paddle::optional& ln1_bias, + const paddle::optional& ln1_out, + const paddle::optional& ln1_mean, + const paddle::optional& ln1_variance, + const paddle::optional& ln2_scale, + const paddle::optional& ln2_bias, + const paddle::optional& ln2_mean, + const paddle::optional& ln2_variance, + const paddle::optional& linear2_bias, + bool pre_layer_norm, + float ln1_epsilon, + float ln2_epsilon, + const std::string& act_method, + float dropout1_prob, + float dropout2_prob, + const std::string& dropout1_implementation, + const std::string& dropout2_implementation, + bool is_test, + bool dropout1_fix_seed, + bool dropout2_fix_seed, + int dropout1_seed_val, + int dropout2_seed_val, + bool add_residual, + int ring_id, + DenseTensor* x_grad, + DenseTensor* ln1_scale_grad, + DenseTensor* ln1_bias_grad, + DenseTensor* ln2_scale_grad, + DenseTensor* ln2_bias_grad, + DenseTensor* linear1_weight_grad, + DenseTensor* linear1_bias_grad, + DenseTensor* linear2_weight_grad, + DenseTensor* linear2_bias_grad) { + // inputs + auto* d_out = &out_grad; + auto* x_ptr = &x; + + auto* dropout1_mask_ptr = &dropout1_mask; + auto* dropout2_mask_ptr = &dropout2_mask; + auto* linear1_out_ptr = &linear1_out; + auto* ln1_out_ptr = pre_layer_norm ? 
ln1_out.get_ptr() : nullptr; + + auto* dropout1_out_ptr = &dropout1_out; + auto* dropout2_out_ptr = &dropout2_out; + auto* linear1_weight_ptr = &linear1_weight; + auto* linear2_weight_ptr = &linear2_weight; + + const phi::DenseTensor* ln_mean = nullptr; + const phi::DenseTensor* ln_variance = nullptr; + const phi::DenseTensor* ln_scale = nullptr; + + if (pre_layer_norm) { + ln_mean = ln1_mean.get_ptr(); + ln_variance = ln1_variance.get_ptr(); + ln_scale = ln1_scale.get_ptr(); + } else { + ln_mean = ln2_mean.get_ptr(); + ln_variance = ln2_variance.get_ptr(); + ln_scale = ln2_scale.get_ptr(); + } + + // output + auto* d_x = x_grad; + + phi::DenseTensor* d_ln_scale = nullptr; + phi::DenseTensor* d_ln_bias = nullptr; + + if (pre_layer_norm) { + d_ln_scale = ln1_scale_grad; + d_ln_bias = ln1_bias_grad; + } else { + d_ln_scale = ln2_scale_grad; + d_ln_bias = ln2_bias_grad; + } + + auto* d_linear1_weight = linear1_weight_grad; + auto* d_linear1_bias = linear1_bias_grad; + auto* d_linear2_weight = linear2_weight_grad; + auto* d_linear2_bias = linear2_bias_grad; + + float epsilon = 0.0f; + if (pre_layer_norm) { + epsilon = ln1_epsilon; + } else { + epsilon = ln2_epsilon; + } + + bool is_upscale_in_train_1 = dropout1_implementation == "upscale_in_train"; + bool is_upscale_in_train_2 = dropout2_implementation == "upscale_in_train"; + + phi::XPUDropoutParam dropout_param1; + dropout_param1.initXPUDropoutParam(dropout1_prob, + is_upscale_in_train_1, + is_test, + dropout1_fix_seed, + nullptr, + dropout1_seed_val); + phi::XPUDropoutParam dropout_param2; + dropout_param2.initXPUDropoutParam(dropout2_prob, + is_upscale_in_train_2, + is_test, + dropout2_fix_seed, + nullptr, + dropout2_seed_val); + + dev_ctx.template Alloc(d_ln_scale); + dev_ctx.template Alloc(d_ln_bias); + dev_ctx.template Alloc(d_linear1_bias); + dev_ctx.template Alloc(d_linear2_bias); + dev_ctx.template Alloc(d_linear1_weight); + dev_ctx.template Alloc(d_linear2_weight); + + auto x_dim = x_ptr->dims(); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( + phi::RowMatrixFromVector(x_dim), 0, false); + + auto linear1_weight_dim = linear1_weight_ptr->dims(); + int d_model = linear1_weight_dim[0]; + int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + FFNGrad(dev_ctx, + d_out, + x_ptr, + dropout1_mask_ptr, + dropout2_mask_ptr, + linear1_out_ptr, + ln1_out_ptr, + dropout1_out_ptr, + dropout2_out_ptr, + linear1_weight_ptr, + linear2_weight_ptr, + ln_scale, + ln_mean, + ln_variance, + d_x, + d_linear1_weight, + d_linear1_bias, + d_linear2_weight, + d_linear2_bias, + d_ln_scale, + d_ln_bias, + bsz_seq, + d_model, + dim_feedforward, + dropout_param1, + dropout_param2, + act_method, + pre_layer_norm, + epsilon, + ring_id); +} +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_feedward_grad, + XPU, + ALL_LAYOUT, + phi::fusion::FusedFeedForwardGradKernel, + float, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc new file mode 100644 index 0000000000000..35039ba571e57 --- /dev/null +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc @@ -0,0 +1,390 @@ +/* Copyright (c) 2023 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" +#include "paddle/phi/kernels/xpu/xpu_fused_common_function.h" + +namespace phi { +namespace fusion { + +template +void FFN(const phi::XPUContext& dev_ctx, + const phi::DenseTensor* x, + const phi::DenseTensor* linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor* linear2_weight, + const phi::DenseTensor* linear2_bias, + const phi::DenseTensor* ln_scale, + const phi::DenseTensor* ln_bias, + phi::DenseTensor* out, + phi::DenseTensor* dropout1_mask, + phi::DenseTensor* dropout2_mask, + phi::DenseTensor* ln_mean, + phi::DenseTensor* ln_variance, + phi::DenseTensor* linear1_out, + phi::DenseTensor* ln1_out, + phi::DenseTensor* dropout1_out, + phi::DenseTensor* dropout2_out, + const int bsz_seq, + const int d_model, + const int dim_feedforward, + const std::string& act_method, + const bool pre_layer_norm, + const float epsilon1, + const float epsilon2, + const phi::XPUDropoutParam& dropout_param1, + const phi::XPUDropoutParam& dropout_param2, + int ring_id) { + using XPUTypeT = typename XPUTypeTrait::Type; + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + + int r = xpu::SUCCESS; + + const XPUTypeT* x_ptr = reinterpret_cast(x->data()); + const XPUTypeT* residual_ptr = x_ptr; + const XPUTypeT* linear1_weight_ptr = + reinterpret_cast(linear1_weight->data()); + const XPUTypeT* linear1_bias_ptr = + reinterpret_cast(linear1_bias->data()); + const XPUTypeT* linear2_weight_ptr = + reinterpret_cast(linear2_weight->data()); + const XPUTypeT* linear2_bias_ptr = + reinterpret_cast(linear2_bias->data()); + + const float* ln_scale_ptr = ln_scale->data(); + + const float* ln_bias_ptr = ln_bias->data(); + + // out + XPUTypeT* out_ptr = reinterpret_cast(out->data()); + XPUTypeT* linear1_out_ptr = + reinterpret_cast(linear1_out->data()); + XPUTypeT* dropout1_mask_ptr = + reinterpret_cast(dropout1_mask->data()); + XPUTypeT* dropout2_mask_ptr = + reinterpret_cast(dropout2_mask->data()); + float* ln_mean_ptr = ln_mean->data(); + float* ln_variance_ptr = ln_variance->data(); + + XPUTypeT* dropout1_out_ptr = + reinterpret_cast(dropout1_out->data()); + XPUTypeT* dropout2_out_ptr = + reinterpret_cast(dropout2_out->data()); + + size_t l3_total_size = xpu_ctx->_l3_mgr.get_size(); + XPUTypeT* linear2_before_tmp_ptr = NULL; // dim_feedforward * bsz_seq + XPUTypeT* linear2_after_tmp_ptr = NULL; // d_model * bsz_seq + if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) { + XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr); + linear2_before_tmp_ptr 
= linear2_after_tmp_ptr = l3_ptr; + } else if ((l3_total_size < dim_feedforward * bsz_seq * sizeof(T)) && + (l3_total_size >= d_model * bsz_seq * sizeof(T))) { + XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr); + linear2_after_tmp_ptr = l3_ptr; + linear2_before_tmp_ptr = + RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(linear2_before_tmp_ptr); + + } else { + XPUTypeT* gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(gm_ptr); + linear2_before_tmp_ptr = linear2_after_tmp_ptr = gm_ptr; + } + + // layernorm + if (pre_layer_norm) { + XPUTypeT* ln1_out_ptr = reinterpret_cast(ln1_out->data()); + r = xpu::layer_norm(xpu_ctx, + x_ptr, + ln1_out_ptr, + bsz_seq, + d_model, + epsilon1, + ln_scale_ptr, + ln_bias_ptr, + ln_mean_ptr, + ln_variance_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm "); + x_ptr = ln1_out_ptr; + } + + // fc + phi::XpuFcInfo linear1_fc_info; + linear1_fc_info.InitFcInfo(0, + bsz_seq, + dim_feedforward, + d_model, + false, + false, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction(xpu_ctx, + x_ptr, + linear1_weight_ptr, + linear2_before_tmp_ptr, + linear1_fc_info, + 1.0f); + + // bias + r = xpu::broadcast_add(xpu_ctx, + linear2_before_tmp_ptr, + linear1_bias_ptr, + linear1_out_ptr, + {bsz_seq, dim_feedforward}, + {dim_feedforward}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + // act + if (act_method == "gelu") { + r = xpu::gelu( + xpu_ctx, linear1_out_ptr, linear2_before_tmp_ptr, linear1_out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu"); + } else if (act_method == "relu") { + r = xpu::relu( + xpu_ctx, linear1_out_ptr, linear2_before_tmp_ptr, linear1_out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Currently only supports gelu or relu activation functions!")); + } + + // dropout1 + phi::Dropout(xpu_ctx, + linear2_before_tmp_ptr, + dropout1_mask_ptr, + dropout1_out_ptr, + dropout_param1, + dropout1_out->numel()); + + // fc + phi::XpuFcInfo linear2_fc_info; + linear2_fc_info.InitFcInfo(0, + bsz_seq, + d_model, + dim_feedforward, + false, + false, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction(xpu_ctx, + dropout1_out_ptr, + linear2_weight_ptr, + dropout2_out_ptr, + linear2_fc_info, + 1.0f); + + // bias + r = xpu::broadcast_add(xpu_ctx, + dropout2_out_ptr, + linear2_bias_ptr, + dropout2_out_ptr, + {bsz_seq, d_model}, + {d_model}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + // dropout2 + phi::Dropout(xpu_ctx, + dropout2_out_ptr, + dropout2_mask_ptr, + dropout2_out_ptr, + dropout_param2, + dropout2_out->numel()); + + // residual_ptr + dropout_out + XPUTypeT* residual_add_out_ptr = out_ptr; + if (pre_layer_norm == false) { + residual_add_out_ptr = dropout2_out_ptr; + } + r = xpu::broadcast_add(xpu_ctx, + residual_ptr, + dropout2_out_ptr, + residual_add_out_ptr, + {bsz_seq, d_model}, + {bsz_seq, d_model}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + if (pre_layer_norm == false) { + r = xpu::layer_norm(xpu_ctx, + residual_add_out_ptr, + out_ptr, + bsz_seq, + d_model, + epsilon2, + ln_scale_ptr, + ln_bias_ptr, + ln_mean_ptr, + ln_variance_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); + } +} + +template +void FusedFeedForwardKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& dropout1_seed, + const paddle::optional& dropout2_seed, + const DenseTensor& linear1_weight, + const paddle::optional& linear1_bias, + const 
DenseTensor& linear2_weight, + const paddle::optional& linear2_bias, + const paddle::optional& ln1_scale, + const paddle::optional& ln1_bias, + const paddle::optional& ln2_scale, + const paddle::optional& ln2_bias, + bool pre_layer_norm, + float ln1_epsilon, + float ln2_epsilon, + const std::string& act_method, + float dropout1_prob, + float dropout2_prob, + const std::string& dropout1_implementation, + const std::string& dropout2_implementation, + bool is_test, + bool dropout1_fix_seed, + bool dropout2_fix_seed, + int dropout1_seed_val, + int dropout2_seed_val, + bool add_residual, + int ring_id, + DenseTensor* out, + DenseTensor* dropout1_mask, + DenseTensor* dropout2_mask, + DenseTensor* ln1_mean, + DenseTensor* ln1_variance, + DenseTensor* ln2_mean, + DenseTensor* ln2_variance, + DenseTensor* linear1_out, + DenseTensor* ln1_out, + DenseTensor* dropout1_out, + DenseTensor* dropout2_out) { + auto* x_ptr = &x; + auto* linear1_weight_ptr = &linear1_weight; + auto* linear1_bias_ptr = linear1_bias.get_ptr(); + auto* linear2_weight_ptr = &linear2_weight; + auto* linear2_bias_ptr = linear2_bias.get_ptr(); + + const phi::DenseTensor* ln_scale = nullptr; + const phi::DenseTensor* ln_bias = nullptr; + phi::DenseTensor* ln_mean = nullptr; + phi::DenseTensor* ln_variance = nullptr; + + if (pre_layer_norm) { + ln_scale = ln1_scale.get_ptr(); + ln_bias = ln1_bias.get_ptr(); + ln_mean = ln1_mean; + ln_variance = ln1_variance; + dev_ctx.template Alloc(ln1_out); + } else { + ln_scale = ln2_scale.get_ptr(); + ln_bias = ln2_bias.get_ptr(); + ln_mean = ln2_mean; + ln_variance = ln2_variance; + } + + const float epsilon1 = ln1_epsilon; + const float epsilon2 = ln2_epsilon; + + bool is_upscale_in_train_1 = dropout1_implementation == "upscale_in_train"; + bool is_upscale_in_train_2 = dropout2_implementation == "upscale_in_train"; + + auto* dropout1_seed_ptr = dropout1_seed.get_ptr(); + auto* dropout2_seed_ptr = dropout2_seed.get_ptr(); + phi::XPUDropoutParam dropout_param1; + dropout_param1.initXPUDropoutParam(dropout1_prob, + is_upscale_in_train_1, + is_test, + dropout1_fix_seed, + dropout1_seed_ptr, + dropout1_seed_val); + phi::XPUDropoutParam dropout_param2; + dropout_param2.initXPUDropoutParam(dropout2_prob, + is_upscale_in_train_2, + is_test, + dropout2_fix_seed, + dropout2_seed_ptr, + dropout2_seed_val); + + dev_ctx.template Alloc(ln_mean); + dev_ctx.template Alloc(ln_variance); + + dev_ctx.template Alloc(out); + dev_ctx.template Alloc(dropout1_mask); + dev_ctx.template Alloc(dropout2_mask); + dev_ctx.template Alloc(dropout1_out); + dev_ctx.template Alloc(dropout2_out); + dev_ctx.template Alloc(linear1_out); + + auto x_dim = x_ptr->dims(); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( + phi::RowMatrixFromVector(x_dim), 0, false); + + auto dim = linear1_weight_ptr->dims(); + int d_model = dim[0]; + int dim_feedforward = dim[dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + phi::fusion::FFN(dev_ctx, + x_ptr, + linear1_weight_ptr, + linear1_bias_ptr, + linear2_weight_ptr, + linear2_bias_ptr, + ln_scale, + ln_bias, + out, + dropout1_mask, + dropout2_mask, + ln_mean, + ln_variance, + linear1_out, + ln1_out, + dropout1_out, + dropout2_out, + bsz_seq, + d_model, + dim_feedforward, + act_method, + pre_layer_norm, + epsilon1, + epsilon2, + dropout_param1, + dropout_param2, + ring_id); +} +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_feedward, + XPU, + ALL_LAYOUT, + phi::fusion::FusedFeedForwardKernel, + float, + phi::dtype::float16) { + 
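// Outputs 3-6 (Ln1Mean, Ln1Variance, Ln2Mean, Ln2Variance) hold layer-norm statistics, which the XPU kernel computes in float32 even when T is float16, so their dtypes are pinned below.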
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/ops/compat/fused_feedforward_sig.cc b/paddle/phi/ops/compat/fused_feedforward_sig.cc new file mode 100644 index 0000000000000..1dd78288deaf8 --- /dev/null +++ b/paddle/phi/ops/compat/fused_feedforward_sig.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FeedForwardFuseOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("fused_feedforward", + {"X", + "Dropout1Seed", + "Dropout2Seed", + "Linear1Weight", + "Linear1Bias", + "Linear2Weight", + "Linear2Bias", + "Ln1Scale", + "Ln1Bias", + "Ln2Scale", + "Ln2Bias"}, + {"pre_layer_norm", + "ln1_epsilon", + "ln2_epsilon", + "act_method", + "dropout1_rate", + "dropout2_rate", + "dropout1_implementation", + "dropout2_implementation", + "is_test", + "dropout1_fix_seed", + "dropout2_fix_seed", + "dropout1_seed", + "dropout2_seed", + "add_residual", + "ring_id"}, + {"Out", + "Dropout1Mask", + "Dropout2Mask", + "Ln1Mean", + "Ln1Variance", + "Ln2Mean", + "Ln2Variance", + "Linear1Out", + "Ln1Out", + "Dropout1Out", + "Dropout2Out"}); +} + +KernelSignature FeedForwardGradFuseOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("fused_feedforward_grad", + {"Out@GRAD", "X", + "Linear1Weight", "Linear1Bias", + "Linear2Weight", "Dropout1Mask", + "Dropout2Mask", "Linear1Out", + "Dropout1Out", "Dropout2Out", + "Ln1Scale", "Ln1Bias", + "Ln1Out", "Ln1Mean", + "Ln1Variance", "Ln2Scale", + "Ln2Bias", "Ln2Mean", + "Ln2Variance", "Linear2Bias"}, + {"pre_layer_norm", + "ln1_epsilon", + "ln2_epsilon", + "act_method", + "dropout1_rate", + "dropout2_rate", + "dropout1_implementation", + "dropout2_implementation", + "is_test", + "dropout1_fix_seed", + "dropout2_fix_seed", + "dropout1_seed", + "dropout2_seed", + "add_residual", + "ring_id"}, + {"X@GRAD", + "Ln1Scale@GRAD", + "Ln1Bias@GRAD", + "Ln2Scale@GRAD", + "Ln2Bias@GRAD", + "Linear1Weight@GRAD", + "Linear1Bias@GRAD", + "Linear2Weight@GRAD", + "Linear2Bias@GRAD"}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fused_feedforward, + phi::FeedForwardFuseOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(fused_feedforward_grad, + phi::FeedForwardGradFuseOpArgumentMapping); From 81fb7df23d0321aa9b10516d9136dee23ea2ec36 Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Mon, 24 Apr 2023 16:02:08 +0800 Subject: [PATCH 014/405] [Zero-Dim] Support output 0D for to_tensor. 
(#52741) * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * test=allcase * fix doc erros, test=allcase --- paddle/fluid/pybind/imperative.cc | 2 +- python/paddle/audio/functional/window.py | 9 ++- .../pp_utils/p2p_communication.py | 8 +-- .../unittests/parallel_dygraph_none_var.py | 2 +- .../fluid/tests/unittests/test_cholesky_op.py | 2 +- .../fluid/tests/unittests/test_deg2rad.py | 2 +- .../unittests/test_deprecated_decorator.py | 2 +- .../fluid/tests/unittests/test_einsum_v2.py | 2 +- .../fluid/tests/unittests/test_inplace.py | 4 +- .../tests/unittests/test_jit_save_load.py | 6 +- .../fluid/tests/unittests/test_lbfgs_v2.py | 66 +++++++++---------- .../fluid/tests/unittests/test_rad2deg.py | 4 +- .../unittests/test_state_dict_convert.py | 2 +- .../unittests/test_tensor_register_hook.py | 6 +- .../fluid/tests/unittests/test_var_base.py | 4 +- .../tests/unittests/test_zero_dim_tensor.py | 27 ++++++++ python/paddle/tensor/creation.py | 24 +++---- python/paddle/tensor/math.py | 40 +++++------ test/auto_parallel/random_control_unittest.py | 2 +- .../test_distribution_bernoulli.py | 36 +++++----- .../test_distribution_transform.py | 14 ++-- ...t_distribution_transformed_distribution.py | 2 +- test/distribution/test_kl.py | 4 +- .../test_cpu_cuda_to_tensor.py | 4 +- test/dygraph_to_static/test_fallback.py | 2 +- test/dygraph_to_static/test_to_tensor.py | 12 ++++ test/legacy_test/auto_parallel_gpt_model.py | 2 +- test/legacy_test/test_audio_functions.py | 6 +- test/quantization/imperative_test_utils.py | 2 +- 29 files changed, 169 insertions(+), 129 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 44d4d070eafb1..e60211286ed37 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1321,7 +1321,7 @@ void BindImperative(py::module *m_ptr) { import paddle - x = paddle.to_tensor(1.0, stop_gradient=False) + x = paddle.to_tensor([1.0], stop_gradient=False) detach_x = x.detach() detach_x[:] = 10.0 print(x) # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=False, diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py index b2dd63973d3f2..eb84d6f188970 100644 --- a/python/paddle/audio/functional/window.py +++ b/python/paddle/audio/functional/window.py @@ -13,6 +13,8 @@ import math from typing import List, Tuple, Union +import numpy as np + import paddle from paddle import Tensor @@ -38,7 +40,12 @@ def get(self, name): @window_function_register.register() def _cat(x: List[Tensor], data_type: str) -> Tensor: - l = [paddle.to_tensor(_, data_type) for _ in x] + l = [] + for t in x: + if np.isscalar(t) and not isinstance(t, str): + l.append(paddle.to_tensor([t], data_type)) + else: + l.append(paddle.to_tensor(t, data_type)) return paddle.concat(l) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 81385f9a0a4a1..9d3cb5bd17098 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -117,7 +117,7 @@ def 
recv_meta(self, group): def _send_dims_shape_dtype(self, tensor, group): # send len(shape) - dims = paddle.to_tensor(len(tensor.shape)) + dims = paddle.to_tensor([len(tensor.shape)]) dst_rank = _hcg._get_p2p_next_rank() paddle.distributed.send(dims, dst=dst_rank, group=group) @@ -127,11 +127,11 @@ def _send_dims_shape_dtype(self, tensor, group): paddle.distributed.send(shape, dst=dst_rank, group=group) # send dtype - dtype = paddle.to_tensor(paddle_2_number(tensor.dtype)) + dtype = paddle.to_tensor([paddle_2_number(tensor.dtype)]) paddle.distributed.send(dtype, dst=dst_rank, group=group) # send trainable - stop_grad = paddle.to_tensor(int(tensor.stop_gradient)) + stop_grad = paddle.to_tensor([int(tensor.stop_gradient)]) paddle.distributed.send(stop_grad, dst=dst_rank, group=group) def send_meta(self, tensor, group): @@ -148,7 +148,7 @@ def send_meta(self, tensor, group): # send tensor type paddle.distributed.send(tensor_type, dst=dst_rank, group=group) - nums = paddle.to_tensor(len(tensor)) + nums = paddle.to_tensor([len(tensor)]) paddle.distributed.send(nums, dst=dst_rank, group=group) for d in tensor: diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py index fa5d71dbfe2a9..f7720902946e2 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py @@ -40,7 +40,7 @@ def __init__(self): self.step = 0 def forward(self, x): - return paddle.to_tensor(0.0, dtype='float32') + return paddle.to_tensor([0.0], dtype='float32') def fake_sample_reader(): diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index ba7a6fb631ad3..b1261c3d17b22 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -104,7 +104,7 @@ def test_dygraph(self): a = np.random.rand(3, 3) a_t = np.transpose(a, [1, 0]) x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([x_data]) out = paddle.cholesky(x, upper=False) diff --git a/python/paddle/fluid/tests/unittests/test_deg2rad.py b/python/paddle/fluid/tests/unittests/test_deg2rad.py index 77dce311a939d..0f038e86f2522 100644 --- a/python/paddle/fluid/tests/unittests/test_deg2rad.py +++ b/python/paddle/fluid/tests/unittests/test_deg2rad.py @@ -74,7 +74,7 @@ def setUp(self): def test_dygraph(self): paddle.disable_static() - x2 = paddle.to_tensor(180) + x2 = paddle.to_tensor([180]) result2 = paddle.deg2rad(x2) np.testing.assert_allclose(np.pi, result2.numpy(), rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py index e2ed84a57c5d0..4624897f0168e 100755 --- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -111,7 +111,7 @@ def test_ops_elementwise_mul(self): def test_tensor_gradient(self): paddle.__version__ = '2.1.0' - x = paddle.to_tensor(5.0, stop_gradient=False) + x = paddle.to_tensor([5.0], stop_gradient=False) y = paddle.pow(x, 4.0) y.backward() diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index becadd85c02d8..434c59b5b804e 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ 
b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -553,7 +553,7 @@ def test_shape(self): B = paddle.to_tensor(np.array([2.0, 3.0])).astype(paddle.bfloat16) B = B.cuda() C = paddle.einsum('i,i->', A, B) - D = paddle.to_tensor(8.0).astype(paddle.bfloat16) + D = paddle.to_tensor([8.0]).astype(paddle.bfloat16) self.assertEqual(C.item(), D.item()) diff --git a/python/paddle/fluid/tests/unittests/test_inplace.py b/python/paddle/fluid/tests/unittests/test_inplace.py index c10d7d2c5c457..91a569d34c62b 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace.py +++ b/python/paddle/fluid/tests/unittests/test_inplace.py @@ -259,11 +259,11 @@ def inplace_api_processing(self, var): class TestDygraphInplaceReshapeTensor(TestDygraphInplace): def non_inplace_api_processing(self, var): - shape = paddle.to_tensor(-1) + shape = paddle.to_tensor([-1]) return paddle.reshape(var, shape) def inplace_api_processing(self, var): - shape = paddle.to_tensor(-1) + shape = paddle.to_tensor([-1]) return paddle.reshape_(var, shape) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index d04357e201cbb..7f58638e7ac7a 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -283,7 +283,7 @@ def __init__(self, in_size, out_size): super().__init__() self._linear_0 = Linear(in_size, out_size) self._linear_1 = Linear(in_size, out_size) - self._scale = paddle.to_tensor(9.9) + self._scale = paddle.to_tensor([9.9]) @paddle.jit.to_static def forward(self, x): @@ -1196,7 +1196,7 @@ def __init__(self, in_size, out_size): self._linear_1_0 = Linear(self.hidden, self.hidden) self._linear_1_1 = Linear(self.hidden, self.hidden) self._linear_2 = Linear(self.hidden, out_size) - self._scale = paddle.to_tensor(9.9) + self._scale = paddle.to_tensor([9.9]) @paddle.jit.to_static def forward(self, x): @@ -1319,7 +1319,7 @@ def __init__(self, in_size, out_size, load_path): self._linear_1_0 = Linear(out_size, in_size) self._linear_1_1 = Linear(out_size, in_size) self._linear_2 = Linear(out_size, out_size) - self._scale = paddle.to_tensor(9.9) + self._scale = paddle.to_tensor([9.9]) # Load multiple times self._load_l1 = paddle.jit.load(load_path) diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py b/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py index fa64c480ed970..9617938967cd3 100644 --- a/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py +++ b/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py @@ -208,64 +208,64 @@ def error_func(): def test_line_search(self): def func1(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor(0.0) + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) def func2(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor(1.0) + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) def func3(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor(-1.0) + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) _strong_wolfe( func1, - paddle.to_tensor(1.0), - paddle.to_tensor(0.001), - paddle.to_tensor(0.0), - paddle.to_tensor(1.0), - paddle.to_tensor(0.0), - paddle.to_tensor(0.0), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), max_ls=0, ) _strong_wolfe( func2, - paddle.to_tensor(1.0), - paddle.to_tensor(-0.001), - 
paddle.to_tensor(1.0), - paddle.to_tensor(1.0), - paddle.to_tensor(1.0), - paddle.to_tensor(1.0), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), max_ls=1, ) _strong_wolfe( func3, - paddle.to_tensor(1.0), - paddle.to_tensor(-0.001), - paddle.to_tensor(1.0), - paddle.to_tensor(1.0), - paddle.to_tensor(1.0), - paddle.to_tensor(1.0), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), max_ls=1, ) _cubic_interpolate( - paddle.to_tensor(2.0), - paddle.to_tensor(1.0), - paddle.to_tensor(0.0), - paddle.to_tensor(1.0), - paddle.to_tensor(2.0), - paddle.to_tensor(0.0), + paddle.to_tensor([2.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), [0.1, 0.5], ) _cubic_interpolate( - paddle.to_tensor(2.0), - paddle.to_tensor(0.0), - paddle.to_tensor(-3.0), - paddle.to_tensor(1.0), - paddle.to_tensor(1.0), - paddle.to_tensor(-0.1), + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([-3.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.1]), [0.1, 0.5], ) diff --git a/python/paddle/fluid/tests/unittests/test_rad2deg.py b/python/paddle/fluid/tests/unittests/test_rad2deg.py index 8629d7dcd37f2..710d77f0d9fc1 100644 --- a/python/paddle/fluid/tests/unittests/test_rad2deg.py +++ b/python/paddle/fluid/tests/unittests/test_rad2deg.py @@ -73,7 +73,7 @@ def setUp(self): def test_dygraph(self): paddle.disable_static() - x2 = paddle.to_tensor(np.pi / 2) + x2 = paddle.to_tensor([np.pi / 2]) result2 = paddle.rad2deg(x2) np.testing.assert_allclose(90, result2.numpy(), rtol=1e-05) @@ -91,7 +91,7 @@ def setUp(self): def test_dygraph(self): paddle.disable_static() - x2 = paddle.to_tensor(1) + x2 = paddle.to_tensor([1]) result2 = paddle.rad2deg(x2) np.testing.assert_allclose(180 / np.pi, result2.numpy(), rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_state_dict_convert.py b/python/paddle/fluid/tests/unittests/test_state_dict_convert.py index a6b6975eed7a7..90bdd3c1949f5 100644 --- a/python/paddle/fluid/tests/unittests/test_state_dict_convert.py +++ b/python/paddle/fluid/tests/unittests/test_state_dict_convert.py @@ -88,7 +88,7 @@ class TestStateDictReturn(unittest.TestCase): def test_missing_keys_and_unexpected_keys(self): model1 = MyModel2() tmp_dict = {} - tmp_dict["unexpected_keys"] = paddle.to_tensor(1) + tmp_dict["unexpected_keys"] = paddle.to_tensor([1]) missing_keys, unexpected_keys = model1.set_state_dict(tmp_dict) self.assertEqual(len(missing_keys), 2) self.assertEqual(missing_keys[0], "linear.weight") diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 02934d07a8924..16b6d32ce404d 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -553,7 +553,7 @@ def test_register_backward_hook(self): global HOOK_INIT_VALUE global HOOK_IS_CALLED for device in self.devices: - x = paddle.to_tensor(5.0, stop_gradient=False) + x = paddle.to_tensor([5.0], stop_gradient=False) x._register_backward_hook(global_void_hook) for i in range(5): y = paddle.pow(x, 4.0) @@ -567,14 +567,14 @@ def test_register_backward_hook(self): HOOK_IS_CALLED = False def 
test_register_backward_hook_for_interior_var(self): - x = paddle.to_tensor(5.0, stop_gradient=False) + x = paddle.to_tensor([5.0], stop_gradient=False) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): y._register_backward_hook(global_void_hook) def test_register_backward_hook_for_var_without_gradient(self): - x = paddle.to_tensor(5.0) + x = paddle.to_tensor([5.0]) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index b533dc2b6e588..9d489c77374c2 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -114,7 +114,7 @@ def check_with_place(place): ) np.testing.assert_array_equal(x.numpy(), [1.0]) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) - self.assertEqual(x.shape, [1]) + self.assertEqual(x.shape, []) self.assertEqual(x.stop_gradient, False) self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR) @@ -407,7 +407,7 @@ def test_leaf_tensor(self): def test_detach(self): with fluid.dygraph.guard(): - x = paddle.to_tensor(1.0, dtype="float64", stop_gradient=False) + x = paddle.to_tensor([1.0], dtype="float64", stop_gradient=False) detach_x = x.detach() self.assertTrue(detach_x.stop_gradient, True) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index d18f94e78da39..7ea98f7c889a3 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -2385,6 +2385,20 @@ def body(i, x): self.assertEqual(x.grad.shape, []) np.testing.assert_allclose(x.grad, np.array(1.0)) + def test_to_tensor(self): + out1 = paddle.to_tensor(1) + out2 = paddle.to_tensor(2.5) + + out1.retain_grads() + out1.backward() + out2.retain_grads() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out1, 1) + self.assertEqual(out2.shape, []) + self.assertEqual(out2, 2.5) + def test_linalg_slogdet(self): # 2-D input x = paddle.randn([3, 3]) @@ -4355,6 +4369,19 @@ def test_broadcast_tensors(self): self.assertEqual(out1.shape, (2, 3)) self.assertEqual(out2.shape, (2, 3)) + @prog_scope() + def test_to_tensor(self): + out1 = paddle.to_tensor(1) + out2 = paddle.to_tensor(2.5) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out1, out2]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 2.5) + @prog_scope() def test_linalg_slogdet(self): # 2-D input diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index d9b22ac045f8d..186eda03e74d8 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -555,7 +555,7 @@ def _handle_dtype(data, dtype): return data if np.isscalar(data) and not isinstance(data, str): - data = np.array([data]) + data = np.array(data) elif isinstance(data, (list, tuple)): data = np.array(data) if data.dtype == np.object_: @@ -649,7 +649,7 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): if not isinstance(data, np.ndarray): if np.isscalar(data) and not isinstance(data, str): - data = np.array([data]) + data = np.array(data) elif isinstance(data, (list, tuple)): data = np.array(data) @@ -677,12 +677,6 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): and len(data.shape) > 0 and any(isinstance(x, Variable) for x in 
data) ): - if not all( - [x.shape == (1,) for x in data if isinstance(x, Variable)] - ): - raise TypeError( - "Unsupport paddle.to_tensor([Variable, Variable...]) with non-scalar variable." - ) to_stack_list = [None] * data.shape[0] for idx, d in enumerate(data): to_stack_list[idx] = _to_tensor_static(d, dtype, stop_gradient) @@ -717,7 +711,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): (0D-Tensor) default_dtype Python Number ───────────────► paddle.Tensor - (1D-Tensor) + (0D-Tensor) Keep dtype np.ndarray ───────────► paddle.Tensor @@ -746,17 +740,17 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # paddle.to_tensor(1) - # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [1]) + # Tensor(shape=[], dtype=int64, place=CPUPlace, stop_gradient=True, + # 1) x = paddle.to_tensor(1, stop_gradient=False) print(x) - # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=False, - # [1]) + # Tensor(shape=[], dtype=int64, place=CPUPlace, stop_gradient=False, + # 1) paddle.to_tensor(x) # A new tensor will be created with default stop_gradient=True - # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [1]) + # Tensor(shape=[], dtype=int64, place=CPUPlace, stop_gradient=True, + # 1) paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CPUPlace(), stop_gradient=False) # Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=False, diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index efa8cbfc54e02..6b07f57e33a17 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -4421,8 +4421,8 @@ def rad2deg(x, name=None): x3 = paddle.to_tensor(1) result3 = paddle.rad2deg(x3) print(result3) - # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [57.29578018]) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 57.29578018) """ rad2deg_scale = 180 / np.pi if in_dygraph_mode(): @@ -4485,8 +4485,8 @@ def deg2rad(x, name=None): x2 = paddle.to_tensor(180) result2 = paddle.deg2rad(x2) print(result2) - # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [3.14159274]) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 3.14159274) """ deg2rad_scale = np.pi / 180.0 if in_dygraph_mode(): @@ -4545,8 +4545,8 @@ def gcd(x, y, name=None): x1 = paddle.to_tensor(12) x2 = paddle.to_tensor(20) paddle.gcd(x1, x2) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [4]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 4) x3 = paddle.arange(6) paddle.gcd(x3, x2) @@ -4555,17 +4555,17 @@ def gcd(x, y, name=None): x4 = paddle.to_tensor(0) paddle.gcd(x4, x2) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [20]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 20) paddle.gcd(x4, x4) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [0]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 0) x5 = paddle.to_tensor(-20) paddle.gcd(x1, x5) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [4]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 4) """ shape = paddle.broadcast_shape(x.shape, y.shape) x = paddle.broadcast_to(x, shape) @@ -4630,8 +4630,8 @@ def lcm(x, y, name=None): x1 = paddle.to_tensor(12) x2 = paddle.to_tensor(20) paddle.lcm(x1, x2) - # 
Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [60]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 60) x3 = paddle.arange(6) paddle.lcm(x3, x2) @@ -4640,17 +4640,17 @@ def lcm(x, y, name=None): x4 = paddle.to_tensor(0) paddle.lcm(x4, x2) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [0]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 0) paddle.lcm(x4, x4) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [0]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 0) x5 = paddle.to_tensor(-20) paddle.lcm(x1, x5) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [60]) + # Tensor(shape=[], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # 60) """ d = paddle.gcd(x, y) # paddle.mod will raise an error when any element of y is 0. To avoid diff --git a/test/auto_parallel/random_control_unittest.py b/test/auto_parallel/random_control_unittest.py index 52e6e216074fd..f55b57b9f1b1e 100644 --- a/test/auto_parallel/random_control_unittest.py +++ b/test/auto_parallel/random_control_unittest.py @@ -82,7 +82,7 @@ def compare_mask_between_ranks( ): for np_mask in [mask_np_list[i] for i in comapre_idx]: - mask_tensor_local = paddle.to_tensor(np_mask.astype("float32")) + mask_tensor_local = paddle.to_tensor([np_mask.astype("float32")]) if rank == 0: mask_tensor_remote = paddle.ones_like(mask_tensor_local) dy_broadcast_helper(mask_tensor_remote) diff --git a/test/distribution/test_distribution_bernoulli.py b/test/distribution/test_distribution_bernoulli.py index 2229880b7a6bf..490bd9aa54d7a 100644 --- a/test/distribution/test_distribution_bernoulli.py +++ b/test/distribution/test_distribution_bernoulli.py @@ -184,10 +184,10 @@ def init_dynamic_data(self, probs, default_dtype, dtype): ('probs_00', 0.0, 'float64', 'float32'), ('probs_03', 0.3, 'float64', 'float32'), ('probs_10', 1.0, 'float64', 'float32'), - ('probs_tensor_03_32', paddle.to_tensor(0.3), 'float32', 'float32'), + ('probs_tensor_03_32', paddle.to_tensor([0.3]), 'float32', 'float32'), ( 'probs_tensor_03_64', - paddle.to_tensor(0.3, dtype='float64'), + paddle.to_tensor([0.3], dtype='float64'), 'float64', 'float64', ), @@ -257,11 +257,11 @@ def test_variance(self): ), ( paddle.to_tensor( - 0.0, + [0.0], ), ), - (paddle.to_tensor(1.0),), - (paddle.to_tensor(0.0, dtype='float64'),), + (paddle.to_tensor([1.0]),), + (paddle.to_tensor([0.0], dtype='float64'),), ] ) def test_log_prob(self, value): @@ -291,9 +291,9 @@ def test_log_prob(self, value): ] ), ), - (paddle.to_tensor(0.0),), - (paddle.to_tensor(1.0),), - (paddle.to_tensor(0.0, dtype='float64'),), + (paddle.to_tensor([0.0]),), + (paddle.to_tensor([1.0]),), + (paddle.to_tensor([0.0], dtype='float64'),), ] ) def test_prob(self, value): @@ -323,11 +323,11 @@ def test_prob(self, value): ] ), ), - (paddle.to_tensor(0.0),), - (paddle.to_tensor(0.3),), - (paddle.to_tensor(0.7),), - (paddle.to_tensor(1.0),), - (paddle.to_tensor(0.0, dtype='float64'),), + (paddle.to_tensor([0.0]),), + (paddle.to_tensor([0.3]),), + (paddle.to_tensor([0.7]),), + (paddle.to_tensor([1.0]),), + (paddle.to_tensor([0.0], dtype='float64'),), ] ) def test_cdf(self, value): @@ -359,7 +359,7 @@ def test_entropy(self): def test_kl_divergence(self): with paddle.fluid.dygraph.guard(self.place): - other_probs = paddle.to_tensor(0.9, dtype=self.dtype) + other_probs = paddle.to_tensor([0.9], dtype=self.dtype) rv_paddle_other = 
Bernoulli(other_probs) rv_np_other = BernoulliNumpy(other_probs) @@ -422,7 +422,7 @@ def test_kl_divergence(self): # 1-D probs ( 'probs_1d_1d_32', - paddle.to_tensor(0.3), + paddle.to_tensor([0.3]), 'float32', 'float32', [ @@ -432,7 +432,7 @@ def test_kl_divergence(self): ), ( 'probs_1d_1d_64', - paddle.to_tensor(0.3, dtype='float64'), + paddle.to_tensor([0.3], dtype='float64'), 'float64', 'float64', paddle.to_tensor( @@ -444,7 +444,7 @@ def test_kl_divergence(self): ), ( 'probs_1d_2d', - paddle.to_tensor(0.3), + paddle.to_tensor([0.3]), 'float32', 'float32', [100, 2], @@ -452,7 +452,7 @@ def test_kl_divergence(self): ), ( 'probs_1d_3d', - paddle.to_tensor(0.3), + paddle.to_tensor([0.3]), 'float32', 'float32', [100, 2, 3], diff --git a/test/distribution/test_distribution_transform.py b/test/distribution/test_distribution_transform.py index 4bc90d8f792e2..63ecd99d77baa 100644 --- a/test/distribution/test_distribution_transform.py +++ b/test/distribution/test_distribution_transform.py @@ -523,7 +523,7 @@ def test_codomain(self, input, expected): transform.ChainTransform( ( transform.AffineTransform( - paddle.to_tensor(0.0), paddle.to_tensor(1.0) + paddle.to_tensor([0.0]), paddle.to_tensor([1.0]) ), transform.ExpTransform(), ) @@ -560,7 +560,7 @@ def test_forward(self, chain, input, expected): transform.ChainTransform( ( transform.AffineTransform( - paddle.to_tensor(0.0), paddle.to_tensor(-1.0) + paddle.to_tensor([0.0]), paddle.to_tensor([-1.0]) ), transform.ExpTransform(), ) @@ -595,9 +595,9 @@ def test_inverse(self, chain, input, expected): transform.ChainTransform( ( transform.AffineTransform( - paddle.to_tensor(0.0), paddle.to_tensor(-1.0) + paddle.to_tensor([0.0]), paddle.to_tensor([-1.0]) ), - transform.PowerTransform(paddle.to_tensor(2.0)), + transform.PowerTransform(paddle.to_tensor([2.0])), ) ), np.array([1.0, 2.0, 3.0]), @@ -619,7 +619,7 @@ def test_forward_log_det_jacobian(self, chain, input, expected): transform.ChainTransform( ( transform.AffineTransform( - paddle.to_tensor(0.0), paddle.to_tensor(-1.0) + paddle.to_tensor([0.0]), paddle.to_tensor([-1.0]) ), transform.ExpTransform(), ) @@ -638,7 +638,7 @@ def test_forward_shape(self, chain, shape, expected_shape): transform.ChainTransform( ( transform.AffineTransform( - paddle.to_tensor(0.0), paddle.to_tensor(-1.0) + paddle.to_tensor([0.0]), paddle.to_tensor([-1.0]) ), transform.ExpTransform(), ) @@ -743,7 +743,7 @@ def test_inverse_shape(self, shape, expected_shape): @param.place(config.DEVICES) class TestPowerTransform(unittest.TestCase): def setUp(self): - self._t = transform.PowerTransform(paddle.to_tensor(2.0)) + self._t = transform.PowerTransform(paddle.to_tensor([2.0])) def test_init(self): with self.assertRaises(TypeError): diff --git a/test/distribution/test_distribution_transformed_distribution.py b/test/distribution/test_distribution_transformed_distribution.py index 57264b5f8972a..09b5bce4ecc0a 100644 --- a/test/distribution/test_distribution_transformed_distribution.py +++ b/test/distribution/test_distribution_transformed_distribution.py @@ -41,7 +41,7 @@ def _np_sum_rightmost(self, value, n): return np.sum(value, tuple(range(-n, 0))) if n > 0 else value def test_log_prob(self): - value = paddle.to_tensor(0.5) + value = paddle.to_tensor([0.5]) np.testing.assert_allclose( self.simple_log_prob(value, self.base, self.transforms), self._t.log_prob(value), diff --git a/test/distribution/test_kl.py b/test/distribution/test_kl.py index e4b5c51fa75a4..d75f17208a8b9 100644 --- a/test/distribution/test_kl.py +++ 
b/test/distribution/test_kl.py @@ -139,8 +139,8 @@ def test_dispatch_with_unregister(self): ), ( 'test-same-dist', - mock.Exponential(paddle.to_tensor(1.0)), - mock.Exponential(paddle.to_tensor(1.0)), + mock.Exponential(paddle.to_tensor([1.0])), + mock.Exponential(paddle.to_tensor([1.0])), ), ], ) diff --git a/test/dygraph_to_static/test_cpu_cuda_to_tensor.py b/test/dygraph_to_static/test_cpu_cuda_to_tensor.py index 24f0bd84556a2..f8d15971a7bc0 100644 --- a/test/dygraph_to_static/test_cpu_cuda_to_tensor.py +++ b/test/dygraph_to_static/test_cpu_cuda_to_tensor.py @@ -35,8 +35,8 @@ def func(x): class TestToTensor(unittest.TestCase): def test_to_tensor_with_variable_list(self): def func(x): - ones = paddle.to_tensor([1]) - twos = paddle.to_tensor([2]) + ones = paddle.to_tensor(1) + twos = paddle.to_tensor(2) x = paddle.to_tensor([ones, twos, 3, 4]) return x diff --git a/test/dygraph_to_static/test_fallback.py b/test/dygraph_to_static/test_fallback.py index 6da8602e6043f..e4dc0114054ad 100644 --- a/test/dygraph_to_static/test_fallback.py +++ b/test/dygraph_to_static/test_fallback.py @@ -52,7 +52,7 @@ def forward(self, x): class TestFallback(unittest.TestCase): def setUp(self): - self.x = paddle.to_tensor(2).astype('int') + self.x = paddle.to_tensor([2]).astype('int') def tearDown(self): pass diff --git a/test/dygraph_to_static/test_to_tensor.py b/test/dygraph_to_static/test_to_tensor.py index b0131263c4e69..05cd5ec78f2c8 100644 --- a/test/dygraph_to_static/test_to_tensor.py +++ b/test/dygraph_to_static/test_to_tensor.py @@ -84,6 +84,12 @@ def case6(x): return a +def case7(x): + a = paddle.to_tensor(10.0) + + return a + + class TestToTensorReturnVal(unittest.TestCase): def test_to_tensor_badreturn(self): paddle.disable_static() @@ -131,6 +137,12 @@ def test_to_tensor_badreturn(self): self.assertTrue(a.stop_gradient == b.stop_gradient) self.assertTrue(a.place._equals(b.place)) + a = paddle.jit.to_static(case7)(x) + b = case7(x) + self.assertTrue(a.dtype == b.dtype) + self.assertTrue(a.stop_gradient == b.stop_gradient) + self.assertTrue(a.place._equals(b.place)) + class TestStatic(unittest.TestCase): def test_static(self): diff --git a/test/legacy_test/auto_parallel_gpt_model.py b/test/legacy_test/auto_parallel_gpt_model.py index 5aef64e412ad1..7cf48cf1c475b 100644 --- a/test/legacy_test/auto_parallel_gpt_model.py +++ b/test/legacy_test/auto_parallel_gpt_model.py @@ -233,7 +233,7 @@ def core_attn(self, q, k, v, attn_mask): product = paddle.matmul(x=q, y=k, transpose_y=True) product = paddle.multiply( product, - paddle.to_tensor(self.head_dim**-0.5, dtype=product.dtype), + paddle.to_tensor([self.head_dim**-0.5], dtype=product.dtype), ) if attn_mask is not None: product = product + attn_mask diff --git a/test/legacy_test/test_audio_functions.py b/test/legacy_test/test_audio_functions.py index 8400bd4ecb40e..47adbdd490501 100644 --- a/test/legacy_test/test_audio_functions.py +++ b/test/legacy_test/test_audio_functions.py @@ -60,7 +60,7 @@ def get_wav_data(dtype: str, num_channels: int, num_frames: int): def test_audio_function(self, val: float, htk_flag: bool): mel_paddle = paddle.audio.functional.hz_to_mel(val, htk_flag) mel_paddle_tensor = paddle.audio.functional.hz_to_mel( - paddle.to_tensor(val), htk_flag + paddle.to_tensor([val]), htk_flag ) mel_librosa = librosa.hz_to_mel(val, htk_flag) np.testing.assert_almost_equal(mel_paddle, mel_librosa, decimal=5) @@ -70,7 +70,7 @@ def test_audio_function(self, val: float, htk_flag: bool): hz_paddle = paddle.audio.functional.mel_to_hz(val, htk_flag) 
hz_paddle_tensor = paddle.audio.functional.mel_to_hz( - paddle.to_tensor(val), htk_flag + paddle.to_tensor([val]), htk_flag ) hz_librosa = librosa.mel_to_hz(val, htk_flag) np.testing.assert_almost_equal(hz_paddle, hz_librosa, decimal=4) @@ -79,7 +79,7 @@ def test_audio_function(self, val: float, htk_flag: bool): ) decibel_paddle = paddle.audio.functional.power_to_db( - paddle.to_tensor(val) + paddle.to_tensor([val]) ) decibel_librosa = librosa.power_to_db(val) np.testing.assert_almost_equal( diff --git a/test/quantization/imperative_test_utils.py b/test/quantization/imperative_test_utils.py index 9e9c2cca447a5..36e931091543f 100644 --- a/test/quantization/imperative_test_utils.py +++ b/test/quantization/imperative_test_utils.py @@ -165,7 +165,7 @@ def forward(self, inputs): x = self.features(x) x = paddle.flatten(x, 1) - x = self.add(x, paddle.to_tensor(0.0)) # For CI + x = self.add(x, paddle.to_tensor([0.0])) # For CI x = self.fc(x) return x From b06ec0c759cf79aa92d6124ac6fcde315785574f Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 24 Apr 2023 16:02:21 +0800 Subject: [PATCH 015/405] [CppExtension Unittest] Add unit test of vector (#53040) --- .../tests/cpp_extension/custom_extension.cc | 11 +++++++++++ .../cpp_extension/test_cpp_extension_jit.py | 15 +++++++++++++++ .../cpp_extension/test_cpp_extension_setup.py | 17 +++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/python/paddle/fluid/tests/cpp_extension/custom_extension.cc b/python/paddle/fluid/tests/cpp_extension/custom_extension.cc index 2fc5c42a80d75..67653bffb3bdb 100644 --- a/python/paddle/fluid/tests/cpp_extension/custom_extension.cc +++ b/python/paddle/fluid/tests/cpp_extension/custom_extension.cc @@ -26,6 +26,16 @@ paddle::Tensor custom_add(const paddle::Tensor& x, const paddle::Tensor& y) { return x.exp() + y.exp(); } +std::vector custom_tensor( + const std::vector& inputs) { + std::vector out; + out.reserve(inputs.size()); + for (const auto& input : inputs) { + out.push_back(input + 1.0); + } + return out; +} + paddle::Tensor nullable_tensor(bool return_none = false) { paddle::Tensor t; if (!return_none) { @@ -45,6 +55,7 @@ paddle::optional optional_tensor(bool return_option = false) { PYBIND11_MODULE(custom_cpp_extension, m) { m.def("custom_add", &custom_add, "exp(x) + exp(y)"); m.def("custom_sub", &custom_sub, "exp(x) - exp(y)"); + m.def("custom_tensor", &custom_tensor, "x + 1"); m.def("nullable_tensor", &nullable_tensor, "returned Tensor might be None"); m.def( "optional_tensor", &optional_tensor, "returned Tensor might be optional"); diff --git a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py index bc6f8113afd91..5723df1b585b0 100644 --- a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py +++ b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py @@ -68,6 +68,7 @@ def tearDown(self): def test_cpp_extension(self): self._test_extension_function() self._test_extension_class() + self._test_vector_tensor() self._test_nullable_tensor() self._test_optional_tensor() if paddle.is_compiled_with_cuda(): @@ -109,6 +110,20 @@ def _test_extension_class(self): atol=1e-5, ) + def _test_vector_tensor(self): + for dtype in self.dtypes: + np_inputs = [ + np.random.uniform(-1, 1, [4, 8]).astype(dtype) for _ in range(3) + ] + inputs = [paddle.to_tensor(np_x, dtype=dtype) for np_x in np_inputs] + + out = custom_cpp_extension.custom_tensor(inputs) + target_out = [x + 1.0 for x in inputs] + for i in range(3): + 
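# custom_tensor (defined in custom_extension.cc above) returns each input plus 1.0, so every output tensor should match target_out element-wise.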
np.testing.assert_allclose( + out[i].numpy(), target_out[i].numpy(), atol=1e-5 + ) + def _test_nullable_tensor(self): x = custom_cpp_extension.nullable_tensor(True) assert x is None, "Return None when input parameter return_none = True" diff --git a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py index 53dffde432095..15d0cb77d03d5 100644 --- a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py +++ b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py @@ -148,6 +148,7 @@ def test_cpp_extension(self): # Extension self._test_extension_function_plain() self._test_extension_function_mixed() + self._test_vector_tensor() self._test_extension_class() self._test_nullable_tensor() self._test_optional_tensor() @@ -218,6 +219,22 @@ def _test_extension_class(self): atol=1e-5, ) + def _test_vector_tensor(self): + import custom_cpp_extension + + for dtype in self.dtypes: + np_inputs = [ + np.random.uniform(-1, 1, [4, 8]).astype(dtype) for _ in range(3) + ] + inputs = [paddle.to_tensor(np_x, dtype=dtype) for np_x in np_inputs] + + out = custom_cpp_extension.custom_tensor(inputs) + target_out = [x + 1 for x in inputs] + for i in range(3): + np.testing.assert_allclose( + out[i].numpy(), target_out[i].numpy(), atol=1e-5 + ) + def _test_nullable_tensor(self): import custom_cpp_extension From 987fb2d88e8731544a777342f75ffcc2cdcc0359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 24 Apr 2023 16:11:59 +0800 Subject: [PATCH 016/405] rm mlu (#53194) --- paddle/fluid/imperative/amp_auto_cast.cc | 4 ++-- paddle/fluid/operators/collective/c_comm_init_op.cc | 4 ++-- .../fluid/operators/generator/get_expected_kernel_func.cc | 2 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 2 +- paddle/fluid/operators/softmax_op.cc | 4 ++-- paddle/fluid/platform/device_context.h | 1 - paddle/fluid/platform/profiler/dump/nodetree.proto | 2 -- paddle/fluid/platform/profiler/profiler.h | 2 +- paddle/fluid/pybind/imperative.cc | 7 +++---- paddle/phi/api/profiler/trace_event.h | 2 -- paddle/phi/common/backend.h | 1 - paddle/phi/common/place.h | 1 - python/paddle/amp/auto_cast.py | 2 +- python/paddle/amp/grad_scaler.py | 2 +- python/paddle/distributed/launch/main.py | 2 +- python/paddle/distributed/spawn.py | 8 +++----- python/paddle/profiler/profiler.py | 8 ++------ 17 files changed, 20 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 2689a4eafa442..bf6bdf382ce44 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -57,7 +57,7 @@ OpSupportedInfos(const std::string& place, 0, platform::errors::InvalidArgument( "The argument `place` should be 'GPU', 'CPU', 'XPU', " - "'NPU', 'MLU', but got '%s'.", + "'NPU', but got '%s'.", place)); std::unordered_set all_ops; @@ -148,7 +148,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU/MLU is compiled seperatly. +// NOTE: GPU/NPU/XPU is compiled seperatly. 
#elif defined(PADDLE_WITH_XPU) auto unsupported_ops_xpu_fp16 = std::get<2>( OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16)); diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 9a34a3a2f3779..b32857a27b2d2 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -57,14 +57,14 @@ class CCommInitOp : public framework::OperatorBase { using CommContext = platform::BKCLCommContext; #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with GPU or XPU or MLU.")); + "PaddlePaddle should be compiled with GPU or XPU.")); #endif PADDLE_ENFORCE_EQ( platform::is_gpu_place(place) || platform::is_xpu_place(place), true, platform::errors::PreconditionNotMet( - "CCommInitOp can run on gpu or xpu or mlu place only.")); + "CCommInitOp can run on gpu or xpu place only.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index 931bbc1fecc7d..558b0c400a4ca 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -80,7 +80,7 @@ phi::KernelKey GetReduceExpectedKernelType( platform::is_custom_place(ctx.GetPlace()), true, platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU or MLU or XPU place")); + "float16 can only be used on GPU or NPU or XPU place")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 1a3925e4422d0..3c2b0b948bb22 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -629,7 +629,7 @@ class ReduceBaseOp : public framework::OperatorWithKernel { platform::is_custom_place(ctx.GetPlace()), true, platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU or MLU or XPU place")); + "float16 can only be used on GPU or NPU or XPU place")); } return phi::KernelKey(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 4f440ec495d42..ab5816965f05c 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -48,7 +48,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { platform::is_custom_place(ctx.GetPlace()), true, platform::errors::InvalidArgument( - "float16 can only be used on GPU/NPU/XPU/MLU and custom place")); + "float16 can only be used on GPU/NPU/XPU and custom place")); } return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type)); @@ -132,7 +132,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()))) PADDLE_THROW(platform::errors::InvalidArgument( - "float16 can only be used on GPU/NPU/XPU/MLU and custom place")); + "float16 can only be used on GPU/NPU/XPU and custom place")); } return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type)); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d2866a9aa1165..96ddef2c60efe 100644 --- a/paddle/fluid/platform/device_context.h +++ 
b/paddle/fluid/platform/device_context.h @@ -96,7 +96,6 @@ enum DeviceType { NPU = 2, XPU = 3, IPU = 4, - MLU = 5, CUSTOM_DEVICE = 6, MAX_DEVICE_TYPES = 7, diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index dc7b5046b079d..cfb424b020269 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -46,8 +46,6 @@ enum TracerEventTypeProto { PythonOp = 13; // Used to mark python level userdefined PythonUserDefined = 14; - // Used to mark mlu runtime record returned by cnpapi - MluRuntime = 15; }; enum TracerMemEventTypeProto { diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 2a7100b0a64ea..b486e7543d96c 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -39,7 +39,7 @@ static constexpr uint32_t kProfileCustomDeviceOptionBit = 3; void SynchronizeDevice(); struct ProfilerOptions { - uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_level = FLAGS_host_trace_level; }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index e60211286ed37..d6a5a8b8dfc87 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -155,7 +155,7 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/" - "MLUPlace/CustomPlace")); + "CustomPlace")); } } @@ -209,8 +209,7 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/" - "MLUPlace")); + "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/")); } self->SetDataType(framework::TransToProtoVarType(tensor->dtype())); } @@ -2214,7 +2213,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, NPUPlace, IPUPlace, MLUPlace" + "CPUPlace, NPUPlace, IPUPlace" "and CUDAPinnedPlace, " "but got Unknown Type!")); } diff --git a/paddle/phi/api/profiler/trace_event.h b/paddle/phi/api/profiler/trace_event.h index 1b049a0f105cb..e526953d5c8e0 100644 --- a/paddle/phi/api/profiler/trace_event.h +++ b/paddle/phi/api/profiler/trace_event.h @@ -51,8 +51,6 @@ enum class TracerEventType { PythonOp = 13, // Used to mark python level userdefined PythonUserDefined = 14, - // Used to mark mlu runtime record returned by cnpapi - MluRuntime = 15, // A flag to denote the number of current types NumTypes }; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 904038a4f5602..b7f30797ca78e 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -53,7 +53,6 @@ enum class Backend : uint8_t { // various acceleration devices' backends XPU, // XPU currently does not exist at the same time as CUDA NPU, // NPU currently does not exist at the same time as CUDA - MLU, // MLU currently does not exist at the same time as CUDA IPU, // paddle kernel primitives backend diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index e65235cfa3d69..543a79977eb7e 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ 
-34,7 +34,6 @@ enum class AllocationType : int8_t { NPU = 5, NPUPINNED = 6, IPU = 7, - MLU = 8, CUSTOM = 9, }; diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 1f82533edbfb3..ae9c957df68fa 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -349,7 +349,7 @@ def amp_guard( or tracer._expected_place.is_custom_place() ): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place ) enable = False diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 5c2d033d33633..0f6d9f21a32c6 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -108,7 +108,7 @@ def __init__( or tracer._expected_place.is_custom_place() ): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place ) enable = False diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index da113e72c35c5..02099c743933e 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -52,7 +52,7 @@ def launch(): - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``. - - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. + - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 713ba7d118f0a..62e6eb0b14228 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -428,9 +428,9 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): Start multiple processes with ``spawn`` method for parallel training. .. note:: - ``spawn`` now only supports GPU or XPU or MLU collective mode. The collective mode - of GPU and XPU and MLU cannot be started at the same time, so the option `gpus` and - `xpus` and 'mlus' cannot be configured at the same time. + ``spawn`` now only supports GPU or XPU collective mode. The collective mode + of GPU and XPU cannot be started at the same time, so the option `gpus` and + `xpus` cannot be configured at the same time. Args: func (function): The target function is called by spawned process. @@ -457,8 +457,6 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): selected gpus, such as "0,1,2,3". Default: None; (3) xpus (string): The training process will run on the selected xpus, such as "0,1,2,3". Default: None; - (4) mlus (string): The training process will run on the - selected mlus, such as "0,1,2,3". Default: None; (5) ips (string): Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". 
Default: "127.0.0.1" . diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 570580a3d2cd4..065721a274750 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -98,19 +98,16 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU, GPU and MLU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. - - - **ProfilerTarget.MLU** : Profile events on MLU. """ CPU = 0 GPU = 1 - MLU = 2 CUSTOM_DEVICE = 3 @@ -335,7 +332,6 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: if _Profiler.is_cnpapi_supported(): return [ ProfilerTarget.CPU, - ProfilerTarget.MLU, ProfilerTarget.CUSTOM_DEVICE, ] return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE] @@ -346,7 +342,7 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` , :ref:`ProfilerTarget.GPU ` and :ref:`ProfilerTarget.MLU ` . + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). 
From 6a8d98e0bbca6b9c45b75bf7fe156330c0a995f7 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 24 Apr 2023 16:13:10 +0800 Subject: [PATCH 017/405] Add weighted sample (#52013) Add paddle.geometric.weighted_sample_neighbors API --- paddle/phi/api/yaml/ops.yaml | 9 + paddle/phi/infermeta/multiary.cc | 47 ++ paddle/phi/infermeta/multiary.h | 11 + .../cpu/weighted_sample_neighbors_kernel.cc | 255 +++++++++ paddle/phi/kernels/funcs/block_radix_topk.cuh | 349 ++++++++++++ paddle/phi/kernels/funcs/random.cuh | 80 +++ .../gpu/weighted_sample_neighbors_kernel.cu | 535 ++++++++++++++++++ .../weighted_sample_neighbors_kernel.h | 35 ++ .../test_weighted_sample_neighbors.py | 217 +++++++ python/paddle/geometric/__init__.py | 2 + python/paddle/geometric/sampling/__init__.py | 1 + python/paddle/geometric/sampling/neighbors.py | 152 ++++- 12 files changed, 1691 insertions(+), 2 deletions(-) create mode 100644 paddle/phi/kernels/cpu/weighted_sample_neighbors_kernel.cc create mode 100644 paddle/phi/kernels/funcs/block_radix_topk.cuh create mode 100644 paddle/phi/kernels/funcs/random.cuh create mode 100644 paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu create mode 100644 paddle/phi/kernels/weighted_sample_neighbors_kernel.h create mode 100644 python/paddle/fluid/tests/unittests/test_weighted_sample_neighbors.py diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index b07bf0ecb99a5..1541d1890a07a 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2144,6 +2144,15 @@ intermediate: warprnntgrad backward : warprnnt_grad +- op : weighted_sample_neighbors + args : (Tensor row, Tensor colptr, Tensor edge_weight, Tensor input_nodes, Tensor eids, int sample_size, bool return_eids) + output : Tensor(out_neighbors), Tensor(out_count), Tensor(out_eids) + infer_meta : + func : WeightedSampleNeighborsInferMeta + kernel : + func : weighted_sample_neighbors + optional: eids + - op : where args : (Tensor condition, Tensor x, Tensor y) output : Tensor diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 45769cdcb591f..5a8e38e21fd72 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -3249,5 +3249,52 @@ void MoeInferMeta(const MetaTensor& x, out->set_layout(x.layout()); } +void WeightedSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& edge_weight, + const MetaTensor& x, + const MetaTensor& eids, + int sample_size, + bool return_eids, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids) { + // GSN: GraphSampleNeighbors + auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { + if (dims.size() == 2) { + PADDLE_ENFORCE_EQ( + dims[1], + 1, + phi::errors::InvalidArgument("The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, + dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dims.size(), + 1, + phi::errors::InvalidArgument( + "The %s should be 1D, when it is not 2D, but we get %d", + tensor_name, + dims.size())); + } + }; + + GSNShapeCheck(row.dims(), "row"); + GSNShapeCheck(col_ptr.dims(), "colptr"); + GSNShapeCheck(edge_weight.dims(), "edge_weight"); + GSNShapeCheck(x.dims(), "input_nodes"); + if (return_eids) { + GSNShapeCheck(eids.dims(), "eids"); + out_eids->set_dims({-1}); + out_eids->set_dtype(row.dtype()); + } + + out->set_dims({-1}); + out->set_dtype(row.dtype()); + out_count->set_dims({-1}); + out_count->set_dtype(DataType::INT32); +} + } // namespace phi 
PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index f094ea90d9a9d..993e6c21ff6ff 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -550,6 +550,17 @@ void WarprnntInferMeta(const MetaTensor& input, MetaTensor* loss, MetaTensor* warpctcgrad); +void WeightedSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& edge_weight, + const MetaTensor& x, + const MetaTensor& eids, + int sample_size, + bool return_eids, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids); + void WhereInferMeta(const MetaTensor& condition, const MetaTensor& x, const MetaTensor& y, diff --git a/paddle/phi/kernels/cpu/weighted_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/weighted_sample_neighbors_kernel.cc new file mode 100644 index 0000000000000..cc2b6cdbdf2fa --- /dev/null +++ b/paddle/phi/kernels/cpu/weighted_sample_neighbors_kernel.cc @@ -0,0 +1,255 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/weighted_sample_neighbors_kernel.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +struct GraphWeightedNode { + T node_id; + float weight_key; + T eid; + GraphWeightedNode() { + node_id = 0; + weight_key = 0; + eid = 0; + } + GraphWeightedNode(T node_id, float weight_key, T eid = 0) + : node_id(node_id), weight_key(weight_key), eid(eid) {} + void operator=(const GraphWeightedNode& other) { + node_id = other.node_id; + weight_key = other.weight_key; + eid = other.eid; + } + friend bool operator>(const GraphWeightedNode& n1, + const GraphWeightedNode& n2) { + return n1.weight_key > n2.weight_key; + } +}; + +template +void SampleWeightedNeighbors( + std::vector& out_src, // NOLINT + const std::vector& out_weight, + std::vector& out_eids, // NOLINT + int sample_size, + std::mt19937& rng, // NOLINT + std::uniform_real_distribution& dice_distribution, // NOLINT + bool return_eids) { + std::priority_queue, + std::vector>, + std::greater>> + min_heap; + for (size_t i = 0; i < out_src.size(); i++) { + float weight_key = log2(dice_distribution(rng)) * (1 / out_weight[i]); + if (static_cast(i) < sample_size) { + if (!return_eids) { + min_heap.push(phi::GraphWeightedNode(out_src[i], weight_key)); + } else { + min_heap.push( + phi::GraphWeightedNode(out_src[i], weight_key, out_eids[i])); + } + } else { + const phi::GraphWeightedNode& small = min_heap.top(); + phi::GraphWeightedNode cmp; + if (!return_eids) { + cmp = GraphWeightedNode(out_src[i], weight_key); + } else { + cmp = GraphWeightedNode(out_src[i], weight_key, out_eids[i]); + } + bool flag = cmp > small; + if (flag) { + min_heap.pop(); + min_heap.push(cmp); + } + } + } + + int cnt = 0; + while (!min_heap.empty()) { + const phi::GraphWeightedNode& tmp = 
min_heap.top(); + out_src[cnt] = tmp.node_id; + if (return_eids) { + out_eids[cnt] = tmp.eid; + } + cnt++; + min_heap.pop(); + } +} + +template +void SampleNeighbors(const T* row, + const T* col_ptr, + const float* edge_weight, + const T* eids, + const T* input, + std::vector* output, + std::vector* output_count, + std::vector* output_eids, + int sample_size, + int bs, + bool return_eids) { + std::vector> out_src_vec; + std::vector> out_weight_vec; + std::vector> out_eids_vec; + // `sample_cumsum_sizes` record the start position and end position + // after sampling. + std::vector sample_cumsum_sizes(bs + 1); + // `total_neighbors` the size of output after sample. + int total_neighbors = 0; + sample_cumsum_sizes[0] = total_neighbors; + for (int i = 0; i < bs; i++) { + T node = input[i]; + int cap = col_ptr[node + 1] - col_ptr[node]; + int k = cap > sample_size ? sample_size : cap; + total_neighbors += k; + sample_cumsum_sizes[i + 1] = total_neighbors; + std::vector out_src; + out_src.resize(cap); + out_src_vec.emplace_back(out_src); + std::vector out_weight; + out_weight.resize(cap); + out_weight_vec.emplace_back(out_weight); + if (return_eids) { + std::vector out_eids; + out_eids.resize(cap); + out_eids_vec.emplace_back(out_eids); + } + } + + output_count->resize(bs); + output->resize(total_neighbors); + if (return_eids) { + output_eids->resize(total_neighbors); + } + + std::random_device rd; + std::mt19937 rng{rd()}; + std::uniform_real_distribution dice_distribution(0, 1); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + // Sample the neighbors in parallelism. + for (int i = 0; i < bs; i++) { + T node = input[i]; + T begin = col_ptr[node], end = col_ptr[node + 1]; + int cap = end - begin; + if (sample_size < cap) { // sample_size < neighbor_len + std::copy(row + begin, row + end, out_src_vec[i].begin()); + std::copy( + edge_weight + begin, edge_weight + end, out_weight_vec[i].begin()); + if (return_eids) { + std::copy(eids + begin, eids + end, out_eids_vec[i].begin()); + } + SampleWeightedNeighbors(out_src_vec[i], + out_weight_vec[i], + out_eids_vec[i], + sample_size, + rng, + dice_distribution, + return_eids); + *(output_count->data() + i) = sample_size; + } else { // sample_size >= neighbor_len, directly copy + std::copy(row + begin, row + end, out_src_vec[i].begin()); + if (return_eids) { + std::copy(eids + begin, eids + end, out_eids_vec[i].begin()); + } + *(output_count->data() + i) = cap; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + // Copy the results parallelism + for (int i = 0; i < bs; i++) { + int k = sample_cumsum_sizes[i + 1] - sample_cumsum_sizes[i]; + std::copy(out_src_vec[i].begin(), + out_src_vec[i].begin() + k, + output->data() + sample_cumsum_sizes[i]); + if (return_eids) { + std::copy(out_eids_vec[i].begin(), + out_eids_vec[i].begin() + k, + output_eids->data() + sample_cumsum_sizes[i]); + } + } +} + +template +void WeightedSampleNeighborsKernel(const Context& dev_ctx, + const DenseTensor& row, + const DenseTensor& col_ptr, + const DenseTensor& edge_weight, + const DenseTensor& x, + const paddle::optional& eids, + int sample_size, + bool return_eids, + DenseTensor* out, + DenseTensor* out_count, + DenseTensor* out_eids) { + const T* row_data = row.data(); + const T* col_ptr_data = col_ptr.data(); + const float* weights_data = edge_weight.data(); + const T* x_data = x.data(); + const T* eids_data = + (eids.get_ptr() == nullptr ? 
nullptr : eids.get_ptr()->data()); + int bs = x.dims()[0]; + + std::vector output; + std::vector output_count; + std::vector output_eids; + + SampleNeighbors(row_data, + col_ptr_data, + weights_data, + eids_data, + x_data, + &output, + &output_count, + &output_eids, + sample_size, + bs, + return_eids); + + if (return_eids) { + out_eids->Resize({static_cast(output_eids.size())}); + T* out_eids_data = dev_ctx.template Alloc(out_eids); + std::copy(output_eids.begin(), output_eids.end(), out_eids_data); + } + + out->Resize({static_cast(output.size())}); + T* out_data = dev_ctx.template Alloc(out); + std::copy(output.begin(), output.end(), out_data); + out_count->Resize({bs}); + int* out_count_data = dev_ctx.template Alloc(out_count); + std::copy(output_count.begin(), output_count.end(), out_count_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL(weighted_sample_neighbors, + CPU, + ALL_LAYOUT, + phi::WeightedSampleNeighborsKernel, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/block_radix_topk.cuh b/paddle/phi/kernels/funcs/block_radix_topk.cuh new file mode 100644 index 0000000000000..320d8ad8fc4f3 --- /dev/null +++ b/paddle/phi/kernels/funcs/block_radix_topk.cuh @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +template< + typename KeyT, + int BLOCK_SIZE, + bool GREATER = true, + int RADIX_BITS = 8> +class BlockRadixTopKGlobalMemory { + static_assert(cub::PowerOfTwo::VALUE && (RADIX_BITS <= (sizeof(KeyT) * 8)), + "RADIX_BITS should be power of 2, and <= (sizeof(KeyT) * 8)"); + static_assert(cub::PowerOfTwo::VALUE, "BLOCK_SIZE should be power of 2"); + using KeyTraits = cub::Traits; + using UnsignedBits = typename KeyTraits::UnsignedBits; + using BlockScanT = cub::BlockScan; + static constexpr int RADIX_SIZE = (1 << RADIX_BITS); + static constexpr int SCAN_ITEMS_PER_THREAD = (RADIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; + using BinBlockLoad = cub::BlockLoad; + using BinBlockStore = cub::BlockStore; + struct _TempStorage { + typename BlockScanT::TempStorage scan_storage; + union { + typename BinBlockLoad::TempStorage load_storage; + typename BinBlockStore::TempStorage store_storage; + } load_store; + union { + int shared_bins[RADIX_SIZE]; + }; + int share_target_k; + int share_bucket_id; + }; + + public: + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + __device__ __forceinline__ BlockRadixTopKGlobalMemory(TempStorage &temp_storage) + : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){}; + __device__ __forceinline__ void radixTopKGetThreshold(const KeyT *data, int k, int size, KeyT &topK, bool &topk_is_unique) { + assert(k < size && k > 0); + int target_k = k; + UnsignedBits key_pattern = 0; + int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; + for (; digit_pos >= 0; digit_pos -= RADIX_BITS) { + UpdateSharedBins(data, size, digit_pos, key_pattern); + InclusiveScanBins(); + UpdateTopK(digit_pos, target_k, key_pattern); + if (target_k == 0) break; + } + if (target_k == 0) { + key_pattern -= 1; + topk_is_unique = true; + } else { + topk_is_unique = false; + } + if (GREATER) key_pattern = ~key_pattern; + UnsignedBits topK_unsigned = KeyTraits::TwiddleOut(key_pattern); + topK = reinterpret_cast(topK_unsigned); + } + + private: + __device__ __forceinline__ void UpdateSharedBins(const KeyT *key, int size, int digit_pos, UnsignedBits key_pattern) { + for (int id = tid_; id < RADIX_SIZE; id += BLOCK_SIZE) { + temp_storage_.shared_bins[id] = 0; + } + cub::CTA_SYNC(); + UnsignedBits key_mask = ((UnsignedBits)(-1)) << ((UnsignedBits)(digit_pos + RADIX_BITS)); +#pragma unroll + for (int idx = tid_; idx < size; idx += BLOCK_SIZE) { + KeyT key_data = key[idx]; + UnsignedBits twiddled_data = KeyTraits::TwiddleIn(reinterpret_cast(key_data)); + if (GREATER) twiddled_data = ~twiddled_data; + UnsignedBits digit_in_radix = cub::BFE(twiddled_data, digit_pos, RADIX_BITS); + if ((twiddled_data & key_mask) == (key_pattern & key_mask)) { + atomicAdd(&temp_storage_.shared_bins[digit_in_radix], 1); + } + } + cub::CTA_SYNC(); + } + __device__ __forceinline__ void InclusiveScanBins() { + int items[SCAN_ITEMS_PER_THREAD]; + BinBlockLoad(temp_storage_.load_store.load_storage).Load(temp_storage_.shared_bins, items, RADIX_SIZE, 0); + cub::CTA_SYNC(); + BlockScanT(temp_storage_.scan_storage).InclusiveSum(items, items); + cub::CTA_SYNC(); + BinBlockStore(temp_storage_.load_store.store_storage).Store(temp_storage_.shared_bins, items, RADIX_SIZE); + cub::CTA_SYNC(); + } + __device__ __forceinline__ void UpdateTopK(int digit_pos, + int &target_k, + UnsignedBits &target_pattern) { + for (int idx = tid_; (idx < RADIX_SIZE); idx += BLOCK_SIZE) { + int prev_count = (idx == 0) ? 
0 : temp_storage_.shared_bins[idx - 1]; + int cur_count = temp_storage_.shared_bins[idx]; + if (prev_count <= target_k && cur_count > target_k) { + temp_storage_.share_target_k = target_k - prev_count; + temp_storage_.share_bucket_id = idx; + } + } + cub::CTA_SYNC(); + target_k = temp_storage_.share_target_k; + int target_bucket_id = temp_storage_.share_bucket_id; + UnsignedBits key_segment = ((UnsignedBits) target_bucket_id) << ((UnsignedBits) digit_pos); + target_pattern |= key_segment; + } + _TempStorage &temp_storage_; + int tid_; +}; + +template< + typename KeyT, + int BLOCK_SIZE, + int ITEMS_PER_THREAD, + bool GREATER = true, + typename ValueT = cub::NullType, + int RADIX_BITS = 8> +class BlockRadixTopKRegister { + static_assert(cub::PowerOfTwo::VALUE && (RADIX_BITS <= (sizeof(KeyT) * 8)), + "RADIX_BITS should be power of 2, and <= (sizeof(KeyT) * 8)"); + static_assert(cub::PowerOfTwo::VALUE, "BLOCK_SIZE should be power of 2"); + using KeyTraits = cub::Traits; + using UnsignedBits = typename KeyTraits::UnsignedBits; + using BlockScanT = cub::BlockScan; + static constexpr int RADIX_SIZE = (1 << RADIX_BITS); + static constexpr bool KEYS_ONLY = std::is_same::value; + static constexpr int SCAN_ITEMS_PER_THREAD = (RADIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; + using BinBlockLoad = cub::BlockLoad; + using BinBlockStore = cub::BlockStore; + using BlockExchangeKey = cub::BlockExchange; + using BlockExchangeValue = cub::BlockExchange; + + using _ExchangeKeyTempStorage = typename BlockExchangeKey::TempStorage; + using _ExchangeValueTempStorage = typename BlockExchangeValue::TempStorage; + typedef union ExchangeKeyTempStorageType { + _ExchangeKeyTempStorage key_storage; + } ExchKeyTempStorageType; + typedef union ExchangeKeyValueTempStorageType { + _ExchangeKeyTempStorage key_storage; + _ExchangeValueTempStorage value_storage; + } ExchKeyValueTempStorageType; + using _ExchangeType = typename std::conditional::type; + + struct _TempStorage { + typename BlockScanT::TempStorage scan_storage; + union { + typename BinBlockLoad::TempStorage load_storage; + typename BinBlockStore::TempStorage store_storage; + } load_store; + union { + int shared_bins[RADIX_SIZE]; + _ExchangeType exchange_storage; + }; + int share_target_k; + int share_bucket_id; + int share_prev_count; + }; + + public: + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + __device__ __forceinline__ BlockRadixTopKRegister(TempStorage &temp_storage) + : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){}; + __device__ __forceinline__ void radixTopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + const int k, const int valid_count) { + TopKGenRank(keys, k, valid_count); + int is_valid[ITEMS_PER_THREAD]; + GenValidArray(is_valid, k); + BlockExchangeKey{temp_storage_.exchange_storage.key_storage}.ScatterToStripedFlagged(keys, keys, ranks_, is_valid); + cub::CTA_SYNC(); + } + __device__ __forceinline__ void radixTopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], + const int k, const int valid_count) { + TopKGenRank(keys, k, valid_count); + int is_valid[ITEMS_PER_THREAD]; + GenValidArray(is_valid, k); + BlockExchangeKey{temp_storage_.exchange_storage.key_storage}.ScatterToStripedFlagged(keys, keys, ranks_, is_valid); + cub::CTA_SYNC(); + BlockExchangeValue{temp_storage_.exchange_storage.value_storage}.ScatterToStripedFlagged(values, values, ranks_, is_valid); + cub::CTA_SYNC(); + } + + private: + __device__ __forceinline__ void TopKGenRank(KeyT (&keys)[ITEMS_PER_THREAD], const int k, const int 
valid_count) { + assert(k <= BLOCK_SIZE * ITEMS_PER_THREAD); + assert(k <= valid_count); + if (k == valid_count) return; + UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); + search_mask_ = 0; + top_k_mask_ = 0; + +#pragma unroll + for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { + int idx = KEY * BLOCK_SIZE + tid_; + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY]; + if (idx < valid_count) search_mask_ |= (1U << KEY); + } + + int target_k = k; + int prefix_k = 0; + + for (int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) { + UpdateSharedBins(unsigned_keys, digit_pos, prefix_k); + InclusiveScanBins(); + UpdateTopK(unsigned_keys, digit_pos, target_k, prefix_k, digit_pos == 0); + if (target_k == 0) break; + } + +#pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { + if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY]; + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + __device__ __forceinline__ void GenValidArray(int (&is_valid)[ITEMS_PER_THREAD], int k) { +#pragma unroll + for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { + if ((top_k_mask_ & (1U << KEY)) && ranks_[KEY] < k) { + is_valid[KEY] = 1; + } else { + is_valid[KEY] = 0; + } + } + } + __device__ __forceinline__ void UpdateSharedBins(UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int digit_pos, int prefix_k) { + for (int id = tid_; id < RADIX_SIZE; id += BLOCK_SIZE) { + temp_storage_.shared_bins[id] = 0; + } + cub::CTA_SYNC(); +//#define USE_MATCH +#ifdef USE_MATCH + int lane_mask = cub::LaneMaskLt(); +#pragma unroll + for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { + bool is_search = search_mask_ & (1U << KEY); + int bucket_idx = -1; + if (is_search) { + UnsignedBits digit_in_radix = cub::BFE(unsigned_keys[KEY], digit_pos, RADIX_BITS); + bucket_idx = (int) digit_in_radix; + } + int warp_match_mask = __match_any_sync(0xffffffff, bucket_idx); + int same_count = __popc(warp_match_mask); + int idx_in_same_bucket = __popc(warp_match_mask & lane_mask); + int same_bucket_root_lane = __ffs(warp_match_mask) - 1; + int same_bucket_start_idx; + if (idx_in_same_bucket == 0 && is_search) { + same_bucket_start_idx = atomicAdd(&temp_storage_.shared_bins[bucket_idx], same_count); + } + same_bucket_start_idx = __shfl_sync(0xffffffff, same_bucket_start_idx, same_bucket_root_lane, 32); + if (is_search) { + ranks_[KEY] = same_bucket_start_idx + idx_in_same_bucket + prefix_k; + } + } +#else +#pragma unroll + for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { + bool is_search = search_mask_ & (1U << KEY); + int bucket_idx = -1; + if (is_search) { + UnsignedBits digit_in_radix = cub::BFE(unsigned_keys[KEY], digit_pos, RADIX_BITS); + bucket_idx = (int) digit_in_radix; + ranks_[KEY] = atomicAdd(&temp_storage_.shared_bins[bucket_idx], 1) + prefix_k; + } + } +#endif + cub::CTA_SYNC(); + } + __device__ __forceinline__ void InclusiveScanBins() { + int items[SCAN_ITEMS_PER_THREAD]; + BinBlockLoad(temp_storage_.load_store.load_storage).Load(temp_storage_.shared_bins, items, RADIX_SIZE, 0); + cub::CTA_SYNC(); + BlockScanT(temp_storage_.scan_storage).InclusiveSum(items, items); + cub::CTA_SYNC(); + BinBlockStore(temp_storage_.load_store.store_storage).Store(temp_storage_.shared_bins, items, RADIX_SIZE); + cub::CTA_SYNC(); + } + __device__ __forceinline__ void UpdateTopK(UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int digit_pos, + int 
&target_k, + int &prefix_k, + bool mark_equal) { + for (int idx = tid_; (idx < RADIX_SIZE); idx += BLOCK_SIZE) { + int prev_count = (idx == 0) ? 0 : temp_storage_.shared_bins[idx - 1]; + int cur_count = temp_storage_.shared_bins[idx]; + if (prev_count <= target_k && cur_count > target_k) { + temp_storage_.share_target_k = target_k - prev_count; + temp_storage_.share_bucket_id = idx; + temp_storage_.share_prev_count = prev_count; + } + } + cub::CTA_SYNC(); + target_k = temp_storage_.share_target_k; + prefix_k += temp_storage_.share_prev_count; + int target_bucket_id = temp_storage_.share_bucket_id; +#pragma unroll + for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { + if (search_mask_ & (1U << KEY)) { + UnsignedBits digit_in_radix = cub::BFE(unsigned_keys[KEY], digit_pos, RADIX_BITS); + if (digit_in_radix < target_bucket_id) { + top_k_mask_ |= (1U << KEY); + search_mask_ &= ~(1U << KEY); + } else if (digit_in_radix > target_bucket_id) { + search_mask_ &= ~(1U << KEY); + } else { + if (mark_equal) top_k_mask_ |= (1U << KEY); + } + if (digit_in_radix <= target_bucket_id) { + int prev_count = (digit_in_radix == 0) ? 0 : temp_storage_.shared_bins[digit_in_radix - 1]; + ranks_[KEY] += prev_count; + } + } + } + cub::CTA_SYNC(); + } + + _TempStorage &temp_storage_; + int tid_; + int ranks_[ITEMS_PER_THREAD]; + unsigned int search_mask_; + unsigned int top_k_mask_; +}; + +}; // end namespace framework +}; // end namespace paddle +#endif diff --git a/paddle/phi/kernels/funcs/random.cuh b/paddle/phi/kernels/funcs/random.cuh new file mode 100644 index 0000000000000..502b7e85ee97f --- /dev/null +++ b/paddle/phi/kernels/funcs/random.cuh @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#ifdef __NVCC__ +#include // NOLINT +#endif + +class RandomNumGen { + public: + __host__ __device__ __forceinline__ RandomNumGen(int gid, unsigned long long seed) { + next_random = seed + gid; + next_random ^= next_random >> 33U; + next_random *= 0xff51afd7ed558ccdUL; + next_random ^= next_random >> 33U; + next_random *= 0xc4ceb9fe1a85ec53UL; + next_random ^= next_random >> 33U; + } + __host__ __device__ __forceinline__ ~RandomNumGen() = default; + __host__ __device__ __forceinline__ void SetSeed(int seed) { + next_random = seed; + NextValue(); + } + __host__ __device__ __forceinline__ unsigned long long SaveState() const { + return next_random; + } + __host__ __device__ __forceinline__ void LoadState(unsigned long long state) { + next_random = state; + } + __host__ __device__ __forceinline__ int Random() { + int ret_value = (int) (next_random & 0x7fffffffULL); + NextValue(); + return ret_value; + } + __host__ __device__ __forceinline__ int RandomMod(int mod) { + return Random() % mod; + } + __host__ __device__ __forceinline__ int64_t Random64() { + int64_t ret_value = (next_random & 0x7FFFFFFFFFFFFFFFLL); + NextValue(); + return ret_value; + } + __host__ __device__ __forceinline__ int64_t RandomMod64(int64_t mod) { + return Random64() % mod; + } + __host__ __device__ __forceinline__ float RandomUniformFloat(float max = 1.0f, float min = 0.0f) { + int value = (int) (next_random & 0xffffff); + auto ret_value = (float) value; + ret_value /= 0xffffffL; + ret_value *= (max - min); + ret_value += min; + NextValue(); + return ret_value; + } + __host__ __device__ __forceinline__ bool RandomBool(float true_prob) { + float value = RandomUniformFloat(); + return value <= true_prob; + } + __host__ __device__ __forceinline__ void NextValue() { + //next_random = next_random * (unsigned long long)0xc4ceb9fe1a85ec53UL + generator_id; + //next_random = next_random * (unsigned long long)25214903917ULL + 11; + next_random = next_random * (unsigned long long) 13173779397737131ULL + 1023456798976543201ULL; + } + + private: + unsigned long long next_random = 1; +}; diff --git a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu new file mode 100644 index 0000000000000..d4e0ca632e04d --- /dev/null +++ b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu @@ -0,0 +1,535 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#include +#include "cub/cub.cuh" +#endif + +#include "math.h" // NOLINT +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/block_radix_topk.cuh" +#include "paddle/phi/kernels/funcs/random.cuh" +#include "paddle/phi/kernels/weighted_sample_neighbors_kernel.h" +#define SAMPLE_SIZE_THRESHOLD 1024 + +namespace phi { + +#ifdef PADDLE_WITH_CUDA +__device__ __forceinline__ float GenKeyFromWeight( + const float weight, + RandomNumGen& rng) { // NOLINT + rng.NextValue(); + float u = -rng.RandomUniformFloat(1.0f, 0.5f); + long long random_num2 = 0; // NOLINT + int seed_count = -1; + do { + random_num2 = rng.Random64(); + seed_count++; + } while (!random_num2); + int one_bit = __clzll(random_num2) + seed_count * 64; + u *= exp2f(-one_bit); + float logk = (log1pf(u) / logf(2.0)) * (1 / weight); + return logk; +} +#endif + +template +__global__ void GetSampleCountAndNeighborCountKernel(const T* col_ptr, + const T* input_nodes, + int* actual_size, + int* neighbor_count, + int sample_size, + int n) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= n) return; + T nid = input_nodes[i]; + int neighbor_size = static_cast(col_ptr[nid + 1] - col_ptr[nid]); + // sample_size < 0 means sample all. + int k = neighbor_size; + if (sample_size >= 0) { + k = min(neighbor_size, sample_size); + } + actual_size[i] = k; + if (NeedNeighbor) { + neighbor_count[i] = (neighbor_size <= sample_size) ? 0 : neighbor_size; + } +} + +#ifdef PADDLE_WITH_CUDA +template +__launch_bounds__(BLOCK_SIZE) __global__ + void WeightedSampleLargeKernel(T* sample_output, + const int* sample_offset, + const int* target_neighbor_offset, + float* weight_keys_buf, + const T* input_nodes, + int input_node_count, + const T* in_rows, + const T* col_ptr, + const float* edge_weight, + const T* eids, + int max_sample_count, + unsigned long long random_seed, // NOLINT + T* out_eids, + bool return_eids) { + int i = blockIdx.x; + if (i >= input_node_count) return; + int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE; + T nid = input_nodes[i]; + T start = col_ptr[nid + 1]; + T end = col_ptr[nid]; + int neighbor_count = static_cast(end - start); + + float* weight_keys_local_buff = weight_keys_buf + target_neighbor_offset[i]; + int offset = sample_offset[i]; + if (neighbor_count <= max_sample_count) { + for (int j = threadIdx.x; j < neighbor_count; j += BLOCK_SIZE) { + sample_output[offset + j] = in_rows[start + j]; + if (return_eids) { + out_eids[offset + j] = eids[start + j]; + } + } + } else { + RandomNumGen rng(gidx, random_seed); + for (int j = threadIdx.x; j < neighbor_count; j += BLOCK_SIZE) { + float thread_weight = edge_weight[start + j]; + weight_keys_local_buff[j] = + static_cast(GenKeyFromWeight(thread_weight, rng)); + } + __syncthreads(); + + float topk_val; + bool topk_is_unique; + + using BlockRadixSelectT = + paddle::framework::BlockRadixTopKGlobalMemory; + __shared__ typename BlockRadixSelectT::TempStorage share_storage; + + BlockRadixSelectT{share_storage}.radixTopKGetThreshold( + weight_keys_local_buff, + max_sample_count, + neighbor_count, + topk_val, + topk_is_unique); + __shared__ int cnt; + + if (threadIdx.x == 0) { + cnt = 0; + } + __syncthreads(); + + // We use atomicAdd 1 operations instead of binaryScan to calculate the + // write index, since we do not need to 
keep the relative positions of + // element. + + if (topk_is_unique) { + for (int j = threadIdx.x; j < neighbor_count; j += BLOCK_SIZE) { + float key = weight_keys_local_buff[j]; + bool has_topk = (key >= topk_val); + + if (has_topk) { + int write_index = atomicAdd(&cnt, 1); + sample_output[offset + write_index] = in_rows[start + j]; + if (return_eids) { + out_eids[offset + write_index] = eids[start + j]; + } + } + } + } else { + for (int j = threadIdx.x; j < neighbor_count; j += BLOCK_SIZE) { + float key = weight_keys_local_buff[j]; + bool has_topk = (key > topk_val); + + if (has_topk) { + int write_index = atomicAdd(&cnt, 1); + sample_output[offset + write_index] = in_rows[start + j]; + if (return_eids) { + out_eids[offset + write_index] = eids[start + j]; + } + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < neighbor_count; j += BLOCK_SIZE) { + float key = weight_keys_local_buff[j]; + bool has_topk = (key == topk_val); + if (has_topk) { + int write_index = atomicAdd(&cnt, 1); + if (write_index >= max_sample_count) { + break; + } + sample_output[offset + write_index] = in_rows[start + j]; + if (return_eids) { + out_eids[offset + write_index] = eids[start + j]; + } + } + } + } + } +} +#endif + +template +__global__ void SampleAllKernel(T* sample_output, + const int* sample_offset, + const T* input_nodes, + int input_node_count, + const T* in_rows, + const T* col_ptr, + const T* eids, + T* out_eids, + bool return_eids) { + int i = blockIdx.x; + if (i >= input_node_count) return; + T nid = input_nodes[i]; + T start = col_ptr[nid + 1]; + T end = col_ptr[nid]; + int neighbor_count = static_cast(end - start); + if (neighbor_count <= 0) return; + int offset = sample_offset[i]; + for (int j = threadIdx.x; j < neighbor_count; j += blockDim.x) { + sample_output[offset + j] = in_rows[start + j]; + if (return_eids) { + out_eids[offset + j] = eids[start + j]; + } + } +} + +// A-RES algorithm +#ifdef PADDLE_WITH_CUDA +template +__launch_bounds__(BLOCK_SIZE) __global__ + void WeightedSampleKernel(T* sample_output, + const int* sample_offset, + const T* input_nodes, + int input_node_count, + const T* in_rows, + const T* col_ptr, + const float* edge_weight, + const T* eids, + int max_sample_count, + unsigned long long random_seed, // NOLINT + T* out_eids, + bool return_eids) { + int i = blockIdx.x; + if (i >= input_node_count) return; + int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE; + T nid = input_nodes[i]; + T start = col_ptr[nid]; + T end = col_ptr[nid + 1]; + int neighbor_count = static_cast(end - start); + int offset = sample_offset[i]; + + if (neighbor_count <= max_sample_count) { + for (int j = threadIdx.x; j < neighbor_count; j += BLOCK_SIZE) { + sample_output[offset + j] = in_rows[start + j]; + if (return_eids) { + out_eids[offset + j] = eids[start + j]; + } + } + } else { + RandomNumGen rng(gidx, random_seed); + float weight_keys[ITEMS_PER_THREAD]; + int neighbor_idxs[ITEMS_PER_THREAD]; + using BlockRadixTopKT = paddle::framework:: + BlockRadixTopKRegister; + __shared__ typename BlockRadixTopKT::TempStorage sort_tmp_storage; + + const int tx = threadIdx.x; +#pragma unroll + for (int j = 0; j < ITEMS_PER_THREAD; j++) { + int idx = BLOCK_SIZE * j + tx; + if (idx < neighbor_count) { + float thread_weight = edge_weight[start + idx]; + weight_keys[j] = GenKeyFromWeight(thread_weight, rng); + neighbor_idxs[j] = idx; + } + } + const int valid_count = (neighbor_count < (BLOCK_SIZE * ITEMS_PER_THREAD)) + ? 
neighbor_count + : (BLOCK_SIZE * ITEMS_PER_THREAD); + BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped( + weight_keys, neighbor_idxs, max_sample_count, valid_count); + __syncthreads(); + const int stride = BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count; + + for (int idx_offset = ITEMS_PER_THREAD * BLOCK_SIZE; + idx_offset < neighbor_count; + idx_offset += stride) { +#pragma unroll + for (int j = 0; j < ITEMS_PER_THREAD; j++) { + int local_idx = BLOCK_SIZE * j + tx - max_sample_count; + int target_idx = idx_offset + local_idx; + if (local_idx >= 0 && target_idx < neighbor_count) { + float thread_weight = edge_weight[start + target_idx]; + weight_keys[j] = GenKeyFromWeight(thread_weight, rng); + neighbor_idxs[j] = target_idx; + } + } + const int iter_valid_count = + ((neighbor_count - idx_offset) >= stride) + ? (BLOCK_SIZE * ITEMS_PER_THREAD) + : (max_sample_count + neighbor_count - idx_offset); + BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped( + weight_keys, neighbor_idxs, max_sample_count, iter_valid_count); + __syncthreads(); + } +#pragma unroll + for (int j = 0; j < ITEMS_PER_THREAD; j++) { + int idx = j * BLOCK_SIZE + tx; + if (idx < max_sample_count) { + sample_output[offset + idx] = in_rows[start + neighbor_idxs[j]]; + if (return_eids) { + out_eids[offset + idx] = eids[start + neighbor_idxs[j]]; + } + } + } + } +} +#endif + +template +void WeightedSampleNeighborsKernel(const Context& dev_ctx, + const DenseTensor& row, + const DenseTensor& col_ptr, + const DenseTensor& edge_weight, + const DenseTensor& x, + const paddle::optional& eids, + int sample_size, + bool return_eids, + DenseTensor* out, + DenseTensor* out_count, + DenseTensor* out_eids) { + auto* row_data = row.data(); + auto* col_ptr_data = col_ptr.data(); + auto* weights_data = edge_weight.data(); + auto* x_data = x.data(); + auto* eids_data = + (eids.get_ptr() == nullptr ? 
nullptr : eids.get_ptr()->data()); + int bs = x.dims()[0]; + + thread_local std::random_device rd; + thread_local std::mt19937 gen(rd()); + thread_local std::uniform_int_distribution // NOLINT + distrib; + unsigned long long random_seed = distrib(gen); // NOLINT + const bool need_neighbor_count = sample_size > SAMPLE_SIZE_THRESHOLD; + + out_count->Resize({bs}); + int* out_count_data = + dev_ctx.template Alloc(out_count); // finally copy sample_count + int* neighbor_count_ptr = nullptr; + std::shared_ptr neighbor_count; + auto sample_count = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + (bs + 1) * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + int* sample_count_ptr = reinterpret_cast(sample_count->ptr()); + + int grid_size = (bs + 127) / 128; + if (need_neighbor_count) { + neighbor_count = phi::memory_utils::AllocShared( + dev_ctx.GetPlace(), + (bs + 1) * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + neighbor_count_ptr = reinterpret_cast(neighbor_count->ptr()); + GetSampleCountAndNeighborCountKernel + <<>>(col_ptr_data, + x_data, + sample_count_ptr, + neighbor_count_ptr, + sample_size, + bs); + } else { + GetSampleCountAndNeighborCountKernel + <<>>( + col_ptr_data, x_data, sample_count_ptr, nullptr, sample_size, bs); + } + + auto sample_offset = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + (bs + 1) * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + int* sample_offset_ptr = reinterpret_cast(sample_offset->ptr()); + +#ifdef PADDLE_WITH_CUDA + const auto& exec_policy = thrust::cuda::par.on(dev_ctx.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream()); +#endif + thrust::exclusive_scan(exec_policy, + sample_count_ptr, + sample_count_ptr + bs + 1, + sample_offset_ptr); + int total_sample_size = 0; +#ifdef PADDLE_WITH_CUDA + cudaMemcpyAsync(&total_sample_size, + sample_offset_ptr + bs, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + cudaMemcpyAsync(out_count_data, + sample_count_ptr, + sizeof(int) * bs, + cudaMemcpyDeviceToDevice, + dev_ctx.stream()); + cudaStreamSynchronize(dev_ctx.stream()); +#else + hipMemcpyAsync(&total_sample_size, + sample_offset_ptr + bs, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + hipMemcpyAsync(out_count_data, + sample_count_ptr, + sizeof(int) * bs, + hipMemcpyDeviceToDevice, + dev_ctx.stream()); + hipStreamSynchronize(dev_ctx.stream()); +#endif + + out->Resize({static_cast(total_sample_size)}); + T* out_data = dev_ctx.template Alloc(out); + T* out_eids_data = nullptr; + if (return_eids) { + out_eids->Resize({static_cast(total_sample_size)}); + out_eids_data = dev_ctx.template Alloc(out_eids); + } + + // large sample size +#ifdef PADDLE_WITH_CUDA + if (sample_size > SAMPLE_SIZE_THRESHOLD) { + thrust::exclusive_scan(exec_policy, + neighbor_count_ptr, + neighbor_count_ptr + bs + 1, + neighbor_count_ptr); + int* neighbor_offset = neighbor_count_ptr; + int target_neighbor_counts; + cudaMemcpyAsync(&target_neighbor_counts, + neighbor_offset + bs, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + cudaStreamSynchronize(dev_ctx.stream()); + + auto tmh_weights = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + target_neighbor_counts * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + float* target_weights_keys_buf_ptr = + reinterpret_cast(tmh_weights->ptr()); + constexpr int BLOCK_SIZE = 256; + WeightedSampleLargeKernel + <<>>(out_data, + sample_offset_ptr, + neighbor_offset, + target_weights_keys_buf_ptr, + x_data, 
+ bs, + row_data, + col_ptr_data, + weights_data, + eids_data, + sample_size, + random_seed, + out_eids_data, + return_eids); + cudaStreamSynchronize(dev_ctx.stream()); + } else if (sample_size <= 0) { + SampleAllKernel<<>>(out_data, + sample_offset_ptr, + x_data, + bs, + row_data, + col_ptr_data, + eids_data, + out_eids_data, + return_eids); + cudaStreamSynchronize(dev_ctx.stream()); + } else { // sample_size < sample_count_threshold + using WeightedSampleFuncType = void (*)(T*, + const int*, + const T*, + int, + const T*, + const T*, + const float*, + const T*, + int, + unsigned long long, // NOLINT + T*, + bool); + static const WeightedSampleFuncType func_array[7] = { + WeightedSampleKernel, + WeightedSampleKernel, + WeightedSampleKernel, + WeightedSampleKernel, + WeightedSampleKernel, + WeightedSampleKernel, + WeightedSampleKernel, + }; + const int block_sizes[7] = {128, 128, 256, 256, 256, 256, 512}; + auto choose_func_idx = [](int sample_size) { + if (sample_size <= 128) { + return 0; + } + if (sample_size <= 384) { + return (sample_size - 129) / 64 + 4; + } + if (sample_size <= 512) { + return 5; + } else { + return 6; + } + }; + int func_idx = choose_func_idx(sample_size); + int block_size = block_sizes[func_idx]; + func_array[func_idx]<<>>( + out_data, + sample_offset_ptr, + x_data, + bs, + row_data, + col_ptr_data, + weights_data, + eids_data, + sample_size, + random_seed, + out_eids_data, + return_eids); + cudaStreamSynchronize(dev_ctx.stream()); + } +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL(weighted_sample_neighbors, + GPU, + ALL_LAYOUT, + phi::WeightedSampleNeighborsKernel, + int, + int64_t) {} diff --git a/paddle/phi/kernels/weighted_sample_neighbors_kernel.h b/paddle/phi/kernels/weighted_sample_neighbors_kernel.h new file mode 100644 index 0000000000000..2a0402f9fc494 --- /dev/null +++ b/paddle/phi/kernels/weighted_sample_neighbors_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GraphWeightedSampleNeighborsKernel( + const Context& dev_ctx, + const DenseTensor& row, + const DenseTensor& col_ptr, + const DenseTensor& edge_weight, + const DenseTensor& x, + const paddle::optional& eids, + int sample_size, + bool return_eids, + DenseTensor* out, + DenseTensor* out_count, + DenseTensor* out_eids); + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_weighted_sample_neighbors.py b/python/paddle/fluid/tests/unittests/test_weighted_sample_neighbors.py new file mode 100644 index 0000000000000..8be782b9adf29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weighted_sample_neighbors.py @@ -0,0 +1,217 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + + +class TestWeightedSampleNeighbors(unittest.TestCase): + def setUp(self): + num_nodes = 20 + edges = np.random.randint(num_nodes, size=(100, 2)) + edges = np.unique(edges, axis=0) + self.edges_id = np.arange(0, len(edges)).astype("int64") + sorted_edges = edges[np.argsort(edges[:, 1])] + + # Calculate dst index cumsum counts, also means colptr + dst_count = np.zeros(num_nodes) + dst_src_dict = {} + for dst in range(0, num_nodes): + true_index = sorted_edges[:, 1] == dst + dst_count[dst] = np.sum(true_index) + dst_src_dict[dst] = sorted_edges[:, 0][true_index] + dst_count = dst_count.astype("int64") + colptr = np.cumsum(dst_count) + colptr = np.insert(colptr, 0, 0) + + self.row = sorted_edges[:, 0].astype("int64") + self.colptr = colptr.astype("int64") + self.nodes = np.unique(np.random.randint(num_nodes, size=5)).astype( + "int64" + ) + self.weight = np.ones(self.row.shape[0]).astype("float32") + self.sample_size = 5 + self.dst_src_dict = dst_src_dict + + def test_sample_result(self): + paddle.disable_static() + row = paddle.to_tensor(self.row) + colptr = paddle.to_tensor(self.colptr) + nodes = paddle.to_tensor(self.nodes) + weight = paddle.to_tensor(self.weight) + + out_neighbors, out_count = paddle.geometric.weighted_sample_neighbors( + row, colptr, weight, nodes, sample_size=self.sample_size + ) + out_count_cumsum = paddle.cumsum(out_count) + for i in range(len(out_count)): + if i == 0: + neighbors = out_neighbors[0 : out_count_cumsum[i]] + else: + neighbors = out_neighbors[ + out_count_cumsum[i - 1] : out_count_cumsum[i] + ] + # Ensure the correct sample size. + self.assertTrue( + out_count[i] == self.sample_size + or out_count[i] == len(self.dst_src_dict[self.nodes[i]]) + ) + # Ensure no repetitive sample neighbors. + self.assertTrue( + neighbors.shape[0] == paddle.unique(neighbors).shape[0] + ) + # Ensure the correct sample neighbors. 
+ in_neighbors = np.isin( + neighbors.numpy(), self.dst_src_dict[self.nodes[i]] + ) + self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0]) + + def test_sample_result_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + row = paddle.static.data( + name="row", shape=self.row.shape, dtype=self.row.dtype + ) + colptr = paddle.static.data( + name="colptr", shape=self.colptr.shape, dtype=self.colptr.dtype + ) + weight = paddle.static.data( + name="weight", shape=self.weight.shape, dtype=self.weight.dtype + ) + nodes = paddle.static.data( + name="nodes", shape=self.nodes.shape, dtype=self.nodes.dtype + ) + + ( + out_neighbors, + out_count, + ) = paddle.geometric.weighted_sample_neighbors( + row, colptr, weight, nodes, sample_size=self.sample_size + ) + exe = paddle.static.Executor(paddle.CPUPlace()) + ret = exe.run( + feed={ + 'row': self.row, + 'colptr': self.colptr, + 'weight': self.weight, + 'nodes': self.nodes, + }, + fetch_list=[out_neighbors, out_count], + ) + out_neighbors, out_count = ret + out_count_cumsum = np.cumsum(out_count) + out_neighbors = np.split(out_neighbors, out_count_cumsum)[:-1] + for neighbors, node, count in zip( + out_neighbors, self.nodes, out_count + ): + self.assertTrue( + count == self.sample_size + or count == len(self.dst_src_dict[node]) + ) + self.assertTrue( + neighbors.shape[0] == np.unique(neighbors).shape[0] + ) + in_neighbors = np.isin(neighbors, self.dst_src_dict[node]) + self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0]) + + def test_raise_errors(self): + paddle.disable_static() + row = paddle.to_tensor(self.row) + colptr = paddle.to_tensor(self.colptr) + weight = paddle.to_tensor(self.weight) + nodes = paddle.to_tensor(self.nodes) + + def check_eid_error(): + paddle.geometric.weighted_sample_neighbors( + row, + colptr, + weight, + nodes, + sample_size=self.sample_size, + return_eids=True, + ) + + self.assertRaises(ValueError, check_eid_error) + + def test_sample_result_with_eids(self): + paddle.disable_static() + row = paddle.to_tensor(self.row) + colptr = paddle.to_tensor(self.colptr) + weight = paddle.to_tensor(self.weight) + nodes = paddle.to_tensor(self.nodes) + eids = paddle.to_tensor(self.edges_id) + + ( + out_neighbors, + out_count, + out_eids, + ) = paddle.geometric.weighted_sample_neighbors( + row, + colptr, + weight, + nodes, + eids=eids, + sample_size=self.sample_size, + return_eids=True, + ) + + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + row = paddle.static.data( + name="row", shape=self.row.shape, dtype=self.row.dtype + ) + colptr = paddle.static.data( + name="colptr", shape=self.colptr.shape, dtype=self.colptr.dtype + ) + weight = paddle.static.data( + name="weight", shape=self.weight.shape, dtype=self.weight.dtype + ) + nodes = paddle.static.data( + name="nodes", shape=self.nodes.shape, dtype=self.nodes.dtype + ) + eids = paddle.static.data( + name="eids", shape=self.edges_id.shape, dtype=self.nodes.dtype + ) + + ( + out_neighbors, + out_count, + out_eids, + ) = paddle.geometric.weighted_sample_neighbors( + row, + colptr, + weight, + nodes, + sample_size=self.sample_size, + eids=eids, + return_eids=True, + ) + exe = paddle.static.Executor(paddle.CPUPlace()) + ret = exe.run( + feed={ + 'row': self.row, + 'colptr': self.colptr, + 'weight': self.weight, + 'nodes': self.nodes, + 'eids': self.edges_id, + }, + fetch_list=[out_neighbors, out_count, out_eids], + ) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/geometric/__init__.py b/python/paddle/geometric/__init__.py
index 9618bc57a203e..6c132a529bc37 100644
--- a/python/paddle/geometric/__init__.py
+++ b/python/paddle/geometric/__init__.py
@@ -22,6 +22,7 @@
 from .reindex import reindex_graph  # noqa: F401
 from .reindex import reindex_heter_graph  # noqa: F401
 from .sampling import sample_neighbors  # noqa: F401
+from .sampling import weighted_sample_neighbors  # noqa: F401
 
 __all__ = [
     'send_u_recv',
@@ -34,4 +35,5 @@
     'reindex_graph',
     'reindex_heter_graph',
     'sample_neighbors',
+    'weighted_sample_neighbors',
 ]
diff --git a/python/paddle/geometric/sampling/__init__.py b/python/paddle/geometric/sampling/__init__.py
index 2e5b24fdd60b7..ee7bacfc9047f 100644
--- a/python/paddle/geometric/sampling/__init__.py
+++ b/python/paddle/geometric/sampling/__init__.py
@@ -13,5 +13,6 @@
 # limitations under the License.
 
 from .neighbors import sample_neighbors  # noqa: F401
+from .neighbors import weighted_sample_neighbors  # noqa: F401
 
 __all__ = []
diff --git a/python/paddle/geometric/sampling/neighbors.py b/python/paddle/geometric/sampling/neighbors.py
index 093fd39617af3..c8d907c078bad 100644
--- a/python/paddle/geometric/sampling/neighbors.py
+++ b/python/paddle/geometric/sampling/neighbors.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
 from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import _non_static_mode
+from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
 from paddle.fluid.layer_helper import LayerHelper
 
 __all__ = []
@@ -170,3 +170,151 @@ def sample_neighbors(
     if return_eids:
         return out_neighbors, out_count, out_eids
     return out_neighbors, out_count
+
+
+def weighted_sample_neighbors(
+    row,
+    colptr,
+    edge_weight,
+    input_nodes,
+    sample_size=-1,
+    eids=None,
+    return_eids=False,
+    name=None,
+):
+    """
+    Graph Weighted Sample Neighbors API.
+
+    This API is mainly used in the Graph Learning domain, and its main purpose is to
+    provide a high-performance graph weighted-sampling method. For example, we get the
+    CSC (Compressed Sparse Column) format of the input graph edges as `row` and
+    `colptr`, so as to convert graph data into a suitable format for sampling, and the
+    input `edge_weight` should also match the CSC format. Besides, `input_nodes` means
+    the nodes we need to sample neighbors for, and `sample_size` means the number of
+    neighbors we want to sample. This API will finally return the weighted sampled
+    neighbors, and the probability of a neighbor being selected is related to its weight:
+    the higher the weight, the higher the probability.
+
+    Args:
+        row (Tensor): One of the components of the CSC format of the input graph, and
+                      the shape should be [num_edges, 1] or [num_edges]. The available
+                      data type is int32, int64.
+        colptr (Tensor): One of the components of the CSC format of the input graph,
+                         and the shape should be [num_nodes + 1, 1] or [num_nodes + 1].
+                         The data type should be the same with `row`.
+        edge_weight (Tensor): The edge weight of the CSC format graph edges. And the shape
+                              should be [num_edges, 1] or [num_edges]. The available data
+                              type is float32.
+        input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
+                              data type should be the same with `row`.
+        sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
+                                     which means returning all the neighbors of the input nodes.
+        eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
+                                 then `eids` should not be None. The data type should be the
+                                 same with `row`. Default is None.
+        return_eids (bool, optional): Whether to return the eid information of the sampled edges. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+                              For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        - out_neighbors (Tensor), the sampled neighbors of the input nodes.
+
+        - out_count (Tensor), the number of sampled neighbors of each input node, and the shape
+          should be the same with `input_nodes`.
+
+        - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the
+          sampled edges.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
+            #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
+            row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
+            colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
+            weight = [0.1, 0.5, 0.2, 0.5, 0.9, 1.9, 2.0, 2.1, 0.01, 0.9, 0.12, 0.59, 0.67]
+            nodes = [0, 8, 1, 2]
+            sample_size = 2
+            row = paddle.to_tensor(row, dtype="int64")
+            colptr = paddle.to_tensor(colptr, dtype="int64")
+            weight = paddle.to_tensor(weight, dtype="float32")
+            nodes = paddle.to_tensor(nodes, dtype="int64")
+            out_neighbors, out_count = paddle.geometric.weighted_sample_neighbors(row, colptr, weight, nodes, sample_size=sample_size)
+
+    """
+
+    if return_eids:
+        if eids is None:
+            raise ValueError(
+                "`eids` should not be None if `return_eids` is True."
+            )
+
+    if in_dygraph_mode():
+        (
+            out_neighbors,
+            out_count,
+            out_eids,
+        ) = _C_ops.weighted_sample_neighbors(
+            row,
+            colptr,
+            edge_weight,
+            input_nodes,
+            eids,
+            sample_size,
+            return_eids,
+        )
+        if return_eids:
+            return out_neighbors, out_count, out_eids
+        return out_neighbors, out_count
+
+    check_variable_and_dtype(
+        row, "row", ("int32", "int64"), "weighted_sample_neighbors"
+    )
+    check_variable_and_dtype(
+        colptr, "colptr", ("int32", "int64"), "weighted_sample_neighbors"
+    )
+    check_variable_and_dtype(
+        edge_weight,
+        "edge_weight",
+        ("float32"),
+        "weighted_sample_neighbors",
+    )
+    check_variable_and_dtype(
+        input_nodes,
+        "input_nodes",
+        ("int32", "int64"),
+        "weighted_sample_neighbors",
+    )
+    if return_eids:
+        check_variable_and_dtype(
+            eids, "eids", ("int32", "int64"), "weighted_sample_neighbors"
+        )
+
+    helper = LayerHelper("weighted_sample_neighbors", **locals())
+    out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
+    out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
+    out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
+    helper.append_op(
+        type="weighted_sample_neighbors",
+        inputs={
+            "row": row,
+            "colptr": colptr,
+            "edge_weight": edge_weight,
+            "input_nodes": input_nodes,
+            "eids": eids if return_eids else None,
+        },
+        outputs={
+            "out_neighbors": out_neighbors,
+            "out_count": out_count,
+            "out_eids": out_eids,
+        },
+        attrs={
+            "sample_size": sample_size,
+            "return_eids": return_eids,
+        },
+    )
+    if return_eids:
+        return out_neighbors, out_count, out_eids
+    return out_neighbors, out_count

From fb5fab079c3225ba469198b2439d2ab3b9b80002 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com>
Date: Mon, 24 Apr 2023 16:13:17 +0800
Subject: [PATCH 018/405] Update sharding_optimizer.py (#53164)
--- .../distributed/fleet/meta_optimizers/sharding_optimizer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 25f9928c826a9..88ad5cbfdc32c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -755,9 +755,6 @@ def _init_pipeline_comm(self, startup_block): sync=False, ) - if core.is_compiled_with_custom_device('npu'): - return - # GPU for pair in self.pipeline_pair: pair_key = pair[0] * 1000 + pair[1] From f2c595a6dc6598afbb93b0c5eaa51377e440a251 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Mon, 24 Apr 2023 16:14:05 +0800 Subject: [PATCH 019/405] remove ASCEND* keyword in doc and cmakelists (#53132) * remove ASCEND* keyword in doc * fix PR-CI-Codestyle-Check error --- .../fluid/tests/unittests/CMakeLists.txt | 2 -- .../tests/unittests/collective/CMakeLists.txt | 2 +- .../tests/unittests/collective/README.md | 2 +- .../unittests/collective/fleet/testslist.csv | 28 +++++++++---------- .../tests/unittests/collective/testslist.csv | 2 +- 5 files changed, 17 insertions(+), 19 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 70e3e8a550e56..670a48faf5c9b 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -38,8 +38,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleetrun) list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) -list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_base) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 08c72f79055af..1aec76f705236 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -215,7 +215,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "150" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/README.md b/python/paddle/fluid/tests/unittests/collective/README.md index a06cf6dc39640..2bf9491814184 100644 --- a/python/paddle/fluid/tests/unittests/collective/README.md +++ b/python/paddle/fluid/tests/unittests/collective/README.md @@ -7,7 +7,7 @@ the properties are the following: * `name`: the test's name * `os`: The supported operator system, ignoring case. If the test run in multiple operator systems, use ";" to split systems, for example, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`,`win32` and `apple`. If the value is empty, this means the test runs on all opertaor systems. -* `arch`: the device's architecture. similar to `os`, multiple valuse ars splited by ";" and ignoring case. 
The supported architectures are `gpu`, `xpu`, `ASCEND`, `ASCEND_CL` and `rocm`. +* `arch`: the device's architecture. similar to `os`, multiple valuse ars splited by ";" and ignoring case. The supported architectures are `gpu`, `xpu` and `rocm`. * `timeout`: timeout of a unittest, whose unit is second. Blank means default. * `run_type`: run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`,which are case-insensitive. * `launcher`: the test launcher.Supported values are test_runner.py, dist_test.sh and custom scripts' name. Blank means test_runner.py. diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv index 9ac8fdf65729e..459a3e39df8e8 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv @@ -1,5 +1,5 @@ name,os,arch,timeout,run_type,launcher,num_port,run_serial,envs,conditions -test_fleet_sharding_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,350,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_sharding_meta_optimizer,,GPU;XPU,350,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_static_mp_layers,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dgc_op,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC test_dgc_optimizer,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC @@ -7,14 +7,14 @@ test_parallel_margin_cross_entropy,,GPU,120,DIST,../../dist_test.sh,2,,http_prox test_parallel_dygraph_transformer,,GPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 test_parallel_dygraph_transformer,,ROCM,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_fp16_allreduce_meta_optimizer,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_rnn_dp,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_rnn_dp,,GPU;XPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_mp_layers,,GPU,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_tcp_store,LINUX;APPLE,,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_stage3_for_eager,,,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_communicator_half_async,,,120,DIST,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_parallel_dygraph_pipeline_parallel,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_pipeline_parallel_with_virtual_stage,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_localsgd_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_localsgd_meta_optimizer,LINUX,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_class_center_sample,,GPU,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_pipeline,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., 
test_fleet_utils,LINUX;APPLE,,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., @@ -22,40 +22,40 @@ test_static_model_parallel,,,240,DIST,../../dist_test.sh,2,,http_proxy=;https_pr test_parallel_dygraph_no_sync,,GPU,300,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_dygraph_sharding_stage2,,,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_control_flow,,,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_lars_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_lars_meta_optimizer,,GPU;XPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_hybrid_parallel_inference_helper,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_rolemaker_new,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dist_mnist_gradient_merge,LINUX;WIN32,GPU;ROCM,360,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_recv_save_op,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_communicator_sync,,,,,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_pipeline_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_gradient_merge_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_pipeline_meta_optimizer,,GPU;XPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_gradient_merge_meta_optimizer,,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_amp_init,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_meta_optimizer_base,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_raw_program_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_raw_program_meta_optimizer,,GPU;XPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_sharding_parallel,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_tensor_parallel,,,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_group_sharded_api_for_eager,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_distributed_strategy,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_dgc_meta_optimizer,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC test_parallel_dygraph_unused_variables,,,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_lamb_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_lamb_meta_optimizer,LINUX,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dgc_momentum_op,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC test_parallel_dygraph_no_sync_gradient_check,,,60,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., 
-test_fleet_pipeline_meta_optimizer_with_recompute,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_hybrid_meta_optimizer,LINUX;WIN32,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_pipeline_meta_optimizer_with_recompute,,GPU;XPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_hybrid_meta_optimizer,LINUX;WIN32,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_qat,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_sparse_embedding,,GPU,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 test_parallel_dygraph_sparse_embedding,,ROCM,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_amp_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_amp_meta_optimizer,,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_sparse_embedding_over_height,,GPU,150,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 test_parallel_dygraph_sparse_embedding_over_height,,ROCM,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_distributed_strategy,LINUX;APPLE,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_auto_parallel_parallelizer,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_recompute_meta_optimizer,LINUX;WIN32,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_recompute_meta_optimizer,LINUX;WIN32,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_private_function,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_new_group,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_new_group.sh,2,,http_proxy=;https_proxy=, -test_c_comm_init_op,LINUX,GPU;XPU;ASCEND;ASCEND_CL,120,DIST,test_c_comm_init_op.sh,2,,http_proxy=;https_proxy=, +test_new_group,,GPU;XPU,,DIST,test_new_group.sh,2,,http_proxy=;https_proxy=, +test_c_comm_init_op,LINUX,GPU;XPU,120,DIST,test_c_comm_init_op.sh,2,,http_proxy=;https_proxy=, test_fused_attention_pass_with_mp,LINUX,GPU,120,DIST,test_fused_attention_pass_with_mp.sh,2,,http_proxy=;https_proxy=, test_ir_pass_pipeline,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_mnist,,GPU;ROCM,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index cf2b6c6757b30..7d8e32998232e 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -45,7 +45,7 @@ test_communication_stream_reduce_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONP test_communication_stream_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
-test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_gen_nccl_id_op,,gpu;rocm,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=, test_mpi_comm,linux,,,DIST,test_mpi_comm.sh,2,,http_proxy=;https_proxy=,WITH_MPI From a85e038a22df9246476f9818c1275c464d5393f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 24 Apr 2023 16:16:04 +0800 Subject: [PATCH 020/405] rm is_npu_place (#53105) --- paddle/fluid/eager/amp_auto_cast.h | 1 - paddle/fluid/eager/amp_utils.h | 1 - paddle/fluid/eager/eager_amp_auto_cast.h | 1 - paddle/fluid/framework/data_device_transform.cc | 2 +- .../new_executor/interpreter/data_transfer.cc | 4 +--- .../new_executor/interpreter/interpreter_util.cc | 5 ++--- .../new_executor/interpreter/stream_analyzer.cc | 7 ++----- paddle/fluid/framework/parallel_executor.cc | 2 +- paddle/fluid/framework/tensor_util.cc | 13 +++---------- paddle/fluid/imperative/amp_auto_cast.cc | 2 -- paddle/fluid/imperative/reducer.cc | 3 --- paddle/fluid/imperative/tracer.cc | 8 -------- paddle/fluid/inference/api/analysis_predictor.cc | 6 ------ .../fluid/inference/api/details/zero_copy_tensor.cc | 2 -- paddle/fluid/operators/array_operator.h | 1 - paddle/fluid/operators/beam_search_decode_op.h | 6 ++---- .../operators/controlflow/conditional_block_op.h | 1 - .../fluid/operators/controlflow/while_op_helper.cc | 1 - .../operators/generator/get_expected_kernel_func.cc | 1 - paddle/fluid/operators/reader/buffered_reader.cc | 2 -- paddle/fluid/operators/reduce_ops/reduce_op.h | 1 - paddle/fluid/operators/softmax_op.cc | 2 -- paddle/fluid/operators/tile_op_functor.h | 6 ++---- paddle/fluid/platform/device_context.cc | 2 -- paddle/fluid/platform/place.cc | 8 -------- paddle/fluid/platform/place.h | 1 - paddle/fluid/pybind/eager.cc | 2 -- paddle/fluid/pybind/eager_math_op_patch.cc | 2 -- paddle/fluid/pybind/imperative.cc | 2 -- paddle/fluid/pybind/place.cc | 2 -- paddle/fluid/pybind/tensor_py.h | 4 +--- 31 files changed, 15 insertions(+), 86 deletions(-) diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index 899216b70d20f..ab06e3d33e542 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -27,7 +27,6 @@ static inline bool NeedCast(const paddle::Tensor& tensor, if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || - paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place) || paddle::platform::is_custom_place(place)) { // CudaPinndePlace is added for varbase created by dataloader diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h index 95c77cf8e32df..ac9edc569df9f 100644 --- a/paddle/fluid/eager/amp_utils.h +++ b/paddle/fluid/eager/amp_utils.h @@ -96,7 +96,6 @@ inline phi::DataType GetDtypeWithPlace( is_right_place = (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || - paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place) || paddle::platform::is_custom_place(place)); if (is_right_place) { diff --git 
a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index c8088eec0f7ab..b4311cddc7e3e 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -27,7 +27,6 @@ static inline bool NeedCast(const paddle::Tensor& tensor, if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || - paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place) || paddle::platform::is_custom_place(place) || paddle::platform::is_cpu_place(place)) { diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index c8c92e95ea3a5..f40710f50bc28 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -29,7 +29,7 @@ void TransDataDevice(const phi::DenseTensor &in, "supported between CPU and CUDA.")); // NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync. - if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) { + if (platform::is_cpu_place(in.place())) { paddle::framework::TensorCopy( in, dst_place, diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 0db7313bafc1f..6d0615a4b7666 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -227,8 +227,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode( // NOTE(winter-wang): in npu and custom device, D2H kernel is asynchronous. // need to explicit synchronization. - if ((platform::is_npu_place(place) || platform::is_custom_place(place)) && - op_type == kMemcpyD2H) { + if ((platform::is_custom_place(place)) && op_type == kMemcpyD2H) { dev_ctx->Wait(); } @@ -419,7 +418,6 @@ std::shared_ptr TransferDevice(const std::string& var_name, if (IsSupportedHeterPlace(dst_place)) { op_type = kMemcpyH2D; int dst_place_type = platform::is_gpu_place(dst_place) ? 0 - : platform::is_npu_place(dst_place) ? 1 : platform::is_ipu_place(dst_place) ? 3 : platform::is_xpu_place(dst_place) ? 2 : platform::is_custom_place(dst_place) ? 6 diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 513b5f5cb3f87..090e973155334 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -146,9 +146,8 @@ bool IsGradOp(const std::string& op_name) { } bool IsSupportedHeterPlace(const phi::Place& place) { - return platform::is_gpu_place(place) || platform::is_npu_place(place) || - platform::is_xpu_place(place) || platform::is_ipu_place(place) || - platform::is_custom_place(place); + return platform::is_gpu_place(place) || platform::is_xpu_place(place) || + platform::is_ipu_place(place) || platform::is_custom_place(place); } bool IsMemcpyD2H(const Instruction& instr) { diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 6401248a2ff54..37d0c41953f96 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -152,8 +152,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( // only gpu/npu need update. 
xpu not need, because xpu memcpy op kernel is // synchronous. - if (platform::is_gpu_place(place_) || platform::is_npu_place(place_) || - platform::is_custom_place(place_)) { + if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) { VLOG(6) << "Parse DeviceContext for " << op_type << ", execution stream = " << execution_stream; if (execution_stream != kDefaultStream) { @@ -447,8 +446,6 @@ platform::DeviceType StreamAnalyzer::GetWaiterType( } else { if (platform::is_xpu_place(place_)) { return platform::kXPU; - } else if (platform::is_npu_place(place_)) { - return platform::kNPU; } else if (platform::is_custom_place(place_)) { return platform::kCUSTOM_DEVICE; } @@ -464,7 +461,7 @@ DownstreamRunType StreamAnalyzer::AnalyseRunTypeForTwoInstructions( } // npu d2h kernel is asynchronous. - if (platform::is_npu_place(place_) || platform::is_custom_place(place_)) { + if (platform::is_custom_place(place_)) { if (interpreter::IsCpuOp(cur_instr) || interpreter::IsMemcpyH2D(next_instr)) { return DownstreamRunType::kDirectRun; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ce0c2891c7dd6..675ec593d9366 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -672,7 +672,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { - PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]), + PADDLE_ENFORCE_EQ(places.size() > 0, true, platform::errors::Unavailable( "NPU is not supported in ParallelExecutor.")); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 38e33520f53ce..28a8d9564ec0b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -275,7 +275,7 @@ void TensorCopyImpl(const TENSOR& src, TENSOR* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || + if (platform::is_gpu_place(dst_place) || platform::is_custom_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { @@ -614,7 +614,6 @@ void TensorFromStream(std::istream& is, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -626,8 +625,7 @@ void TensorFromStream(std::istream& is, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace()) || - platform::is_custom_place(dev_ctx.GetPlace())) { + if (platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -689,7 +687,6 @@ void TensorFromStream(std::istream& is, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || 
defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -701,8 +698,7 @@ void TensorFromStream(std::istream& is, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace()) || - platform::is_custom_place(dev_ctx.GetPlace())) { + if (platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -712,9 +708,6 @@ void TensorFromStream(std::istream& is, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); - } else if (platform::is_npu_place(dev_ctx.GetPlace())) { - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPlace is not supported when not compiled with NPU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "CutomPlace is not supported when not compiled with CustomDevice")); diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index bf6bdf382ce44..335bd4b132533 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -51,7 +51,6 @@ OpSupportedInfos(const std::string& place, {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, {"XPU", &platform::is_xpu_place}, - {"NPU", &platform::is_npu_place}, }; PADDLE_ENFORCE_NE(is_target_place.count(query_place), 0, @@ -245,7 +244,6 @@ inline bool NeedCast(const std::shared_ptr& var) { paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || paddle::platform::is_custom_place(place) || - paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 8c8ff75b2de96..f90e1243d0f8d 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -42,9 +42,6 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif - } else if (platform::is_npu_place(tensor->place())) { - // TODO(kuizhiqing) - VLOG(4) << "divnrank for npu not support yet"; } else if (platform::is_cpu_place(tensor->place())) { VLOG(4) << "before div 2" << *tensor; VLOG(4) << "NDiv for cpu devices : rank = " << nranks; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index af39832b4f5e4..6207fc54f4d16 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -135,11 +135,6 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( } else if (platform::is_cpu_place(place)) { gc.reset(new framework::CPUGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; - } else if (platform::is_npu_place(place)) { - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use NPU device since it's not compiled with NPU," - "Please recompile or reinstall Paddle with NPU support.")); - } else if (platform::is_ipu_place(place)) { #if defined(PADDLE_WITH_IPU) gc.reset(new framework::IPUGarbageCollector(place, 0)); @@ -289,9 +284,6 @@ void Tracer::TraceOpImpl(const std::string& type, PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif - } else if 
(platform::is_npu_place(place)) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU if use NPUPlace.")); } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE phi::DeviceManager::SetDevice(place); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b0ef79a0c7bdf..219c3c2754c68 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1874,9 +1874,6 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( auto xpu_place = place_; res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } - } else if (platform::is_npu_place(place_)) { - auto npu_place = place_; - res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); } else if (platform::is_custom_place(place_)) { auto custom_place = place_; auto paddleplace = static_cast( @@ -1931,9 +1928,6 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( auto xpu_place = place_; res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); } - } else if (platform::is_npu_place(place_)) { - auto npu_place = place_; - res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); } else if (platform::is_custom_place(place_)) { auto custom_place = place_; auto paddleplace = static_cast( diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index c10e6b4a43fdd..9fd94fa5d57e2 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -152,8 +152,6 @@ T *Tensor::data(PlaceType *place, int *size) const { *place = PlaceType::kGPU; } else if (paddle::platform::is_xpu_place(tensor->place())) { *place = PlaceType::kXPU; - } else if (paddle::platform::is_npu_place(tensor->place())) { - *place = PlaceType::kNPU; } else if (paddle::platform::is_custom_place(tensor->place())) { *place = PlaceType::kCUSTOM; } else { diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index fb3e96db4dbd1..c7b8ce3f381d1 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -52,7 +52,6 @@ class ArrayOp : public framework::OperatorBase { size_t offset; if (platform::is_gpu_place(i_tensor.place()) || platform::is_xpu_place(i_tensor.place()) || - platform::is_npu_place(i_tensor.place()) || platform::is_custom_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU phi::DenseTensor t; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index c4f7b3b5785f4..a6d807b028c1b 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -36,8 +36,7 @@ struct BeamSearchDecodeFunctor { tensor_on_gpu_ = false; tensor_on_npu_ = false; // First make a copy of GPU data on CPU - if (platform::is_gpu_place(step_ids_origin_[0].place()) || - platform::is_npu_place(step_ids_origin_[0].place())) { + if (platform::is_gpu_place(step_ids_origin_[0].place())) { if (platform::is_gpu_place(step_ids_origin_[0].place())) { tensor_on_gpu_ = true; } else { @@ -61,8 +60,7 @@ struct BeamSearchDecodeFunctor { step_ids_.push_back(out); } } - if (platform::is_gpu_place(step_scores_origin_[0].place()) || - platform::is_npu_place(step_scores_origin_[0].place())) { + if (platform::is_gpu_place(step_scores_origin_[0].place())) { if (platform::is_gpu_place(step_scores_origin_[0].place())) 
{ tensor_on_gpu_ = true; } else { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 7b89a211ce4ad..0f04a295ed263 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -83,7 +83,6 @@ class ConditionalOp : public framework::OperatorBase { platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; #endif - } else if (platform::is_npu_place(ips[0]->place())) { } else if (platform::is_xpu_place(ips[0]->place())) { #ifdef PADDLE_WITH_XPU phi::DenseTensor cpu_tensor; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 38865a1c53e0b..6ae32f33e957a 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -225,7 +225,6 @@ bool GetCondData(const phi::DenseTensor &cond) { return cond.data()[0]; } // when platform::is_gpu_place(cond.place()) or - // platform::is_npu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index 558b0c400a4ca..6085dabaed6d8 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -75,7 +75,6 @@ phi::KernelKey GetReduceExpectedKernelType( if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()), true, diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 3551b829b0487..f0f54eafaa36b 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -384,8 +384,6 @@ void BufferedReader::ReadNextImpl(paddle::framework::LoDTensorArray *out) { if (platform::is_gpu_place(place_)) { *out = std::move(cuda_buffer_[i]); - } else if (platform::is_npu_place(place_)) { - *out = std::move(npu_buffer_[i]); } else if (platform::is_xpu_place(place_)) { *out = std::move(xpu_buffer_[i]); } else if (platform::is_custom_place(place_)) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 3c2b0b948bb22..3349400a2f93b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -624,7 +624,6 @@ class ReduceBaseOp : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()), true, diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index ab5816965f05c..633ef748be698 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -43,7 +43,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ( 
platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()), true, @@ -128,7 +127,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")); if (input_data_type == framework::proto::VarType::FP16) { if (!(platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()) || platform::is_custom_place(ctx.GetPlace()))) PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h index 4d49510d71d7b..5b2dc31beb044 100644 --- a/paddle/fluid/operators/tile_op_functor.h +++ b/paddle/fluid/operators/tile_op_functor.h @@ -29,8 +29,7 @@ inline std::vector get_repeat_times( auto* repeat_data = repeat_tensor->data(); phi::DenseTensor cpu_repeat_tensor; if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { + platform::is_xpu_place(repeat_tensor->place())) { paddle::framework::TensorCopySync( *repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor); repeat_data = cpu_repeat_tensor.data(); @@ -48,8 +47,7 @@ inline std::vector get_repeat_times( for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { auto tensor = list_repeat_times_tensor[i]; if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { + platform::is_xpu_place(tensor->place())) { phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_repeat_times.push_back(*temp.data()); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index ee12b42c80530..e14ba8b1710cf 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -45,8 +45,6 @@ DeviceType Place2DeviceType(const platform::Place& place) { return platform::DeviceType::XPU; } else if (platform::is_ipu_place(place)) { return platform::DeviceType::IPU; - } else if (platform::is_npu_place(place)) { - return platform::DeviceType::NPU; } else if (platform::is_custom_place(place)) { return platform::DeviceType::CUSTOM_DEVICE; } else { diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index e87531e2b8819..354259ca91b74 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -33,10 +33,6 @@ bool is_xpu_place(const Place &p) { return p.GetType() == phi::AllocationType::XPU; } -bool is_npu_place(const Place &p) { - return p.GetType() == phi::AllocationType::NPU; -} - bool is_ipu_place(const Place &p) { return p.GetType() == phi::AllocationType::IPU; } @@ -73,8 +69,6 @@ bool is_same_place(const Place &p1, const Place &p2) { return true; } else if (is_xpu_place(p1)) { return p1 == p2; - } else if (is_npu_place(p1)) { - return p1 == p2; } else if (is_ipu_place(p1)) { return p1 == p2; } else if (is_custom_place(p1)) { @@ -93,8 +87,6 @@ std::string PlaceHelper::GetDeviceType(const Place &place) { return "cpu"; } else if (is_gpu_place(place)) { return "gpu"; - } else if (is_npu_place(place)) { - return "npu"; } else if (is_xpu_place(place)) { return "xpu"; } else if (is_custom_place(place)) { diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index c5f96f5db80c2..f787a1b28783d 100644 --- a/paddle/fluid/platform/place.h +++ 
b/paddle/fluid/platform/place.h @@ -47,7 +47,6 @@ class PlaceHelper { bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); -bool is_npu_place(const Place &); bool is_ipu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index cd980340c47a4..d69417a6c0a73 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -153,8 +153,6 @@ void InitTensorWithNumpyValue(TensorObject* self, } else if (platform::is_cuda_pinned_place(place)) { SetTensorFromPyArray( impl_ptr, array, place, zero_copy); - } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_custom_place(place)) { SetTensorFromPyArray( impl_ptr, array, place, zero_copy); diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 8756abb5f3673..d764aedd8a1d6 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -102,8 +102,6 @@ void InitTensorWithNumpyValue(const py::object& array, } else if (platform::is_cuda_pinned_place(place)) { SetTensorFromPyArray( impl_ptr, array, place, zero_copy); - } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray(impl_ptr, array, place, zero_copy); } else if (platform::is_custom_place(place)) { SetTensorFromPyArray( impl_ptr, array, place, zero_copy); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index d6a5a8b8dfc87..39a28c7487c30 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -199,8 +199,6 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, } else if (platform::is_cuda_pinned_place(place)) { SetTensorFromPyArray( tensor, array, place, zero_copy); - } else if (platform::is_npu_place(place)) { - SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_ipu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_custom_place(place)) { diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 3a464cdc545ed..1c27d70d1bea7 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -648,8 +648,6 @@ void BindPlace(pybind11::module &m) { // NOLINT [](platform::Place &self) { return platform::is_cpu_place(self); }) .def("is_xpu_place", [](platform::Place &self) { return platform::is_xpu_place(self); }) - .def("is_npu_place", - [](platform::Place &self) { return platform::is_npu_place(self); }) .def("is_ipu_place", [](platform::Place &self) { return platform::is_ipu_place(self); }) .def("is_cuda_pinned_place", diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index d2bd64bc0fb15..05bacbbf54144 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -960,7 +960,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, } bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); - bool is_npu_tensor = platform::is_npu_place(tensor.place()); bool is_custom_device_tensor = platform::is_custom_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype()); @@ -981,8 +980,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr( 
framework::TransToProtoVarType(tensor.dtype())); - if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && - !is_custom_device_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_custom_device_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), From 9d90738cdfa2b1547645bee62a65fe1109abcc84 Mon Sep 17 00:00:00 2001 From: GGBond8488 <33050871+GGBond8488@users.noreply.github.com> Date: Mon, 24 Apr 2023 16:21:48 +0800 Subject: [PATCH 021/405] add 0D support for trace (#53208) * add 0D support for trace, test=allcase * fix trace gpu kernel 0d error, test=allcase * fix windows error, test=allcase --- paddle/phi/infermeta/unary.cc | 1 - paddle/phi/kernels/gpu/trace_kernel.cu | 5 +++- .../phi/kernels/impl/trace_grad_kernel_impl.h | 3 ++- .../tests/unittests/test_zero_dim_tensor.py | 24 +++++++++++++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bfe744446a97b..ea27eba513051 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -4402,7 +4402,6 @@ void TraceInferMeta( auto sizes = vectorize(x_dims); if (x_dims.size() == 2) { sizes.clear(); - sizes.push_back(1); } else { sizes.erase(sizes.begin() + std::max(dim1_, dim2_)); sizes.erase(sizes.begin() + std::min(dim1_, dim2_)); diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 671ca490e136a..304bf778094d3 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -32,7 +32,10 @@ void TraceKernel(const Context& ctx, auto diag = funcs::Diagonal(ctx, &x, offset, axis1, axis2); if (diag.numel() > 0) { std::vector reduce_dims; - reduce_dims.push_back(out->dims().size()); + // Adapt to 0D output + auto out_dim_size = out->dims().size(); + if (out_dim_size == 0) out_dim_size = 1; + reduce_dims.push_back(out_dim_size); funcs::ReduceKernel>( ctx, diag, out, kps::IdentityFunctor(), reduce_dims); } else { diff --git a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h index 90a2327ef3e20..1099f27f3622e 100644 --- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h @@ -91,7 +91,8 @@ void TraceGradKernel(const Context& ctx, auto input_dims = in_grad->dims(); auto input_stride = phi::stride(input_dims); auto output_dims = out_grad.dims(); - auto output_stride = phi::stride(output_dims); + auto output_stride = output_dims.size() == 0 ? 
phi::DDim(output_dims) + : phi::stride(output_dims); auto* out_data = out_grad.data(); T* x_data = ctx.template Alloc(in_grad); diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 7ea98f7c889a3..965bcae57d9db 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -2437,6 +2437,16 @@ def test_multi_dot(self): self.assertEqual(b.grad.shape, [4, 5]) self.assertEqual(c.grad.shape, [5]) + def test_trace(self): + x = paddle.to_tensor([[3, 2], [1, 9]], dtype="float32") + x.stop_gradient = False + out = paddle.trace(x) + out.backward() + + self.assertEqual(out.shape, []) + np.testing.assert_allclose(out, np.array(12)) + self.assertEqual(x.grad.shape, [2, 2]) + class TestSundryAPIStatic(unittest.TestCase): def setUp(self): @@ -4426,6 +4436,20 @@ def test_multi_dot(self): self.assertEqual(res[2].shape, (4, 5)) self.assertEqual(res[3].shape, (5,)) + @prog_scope() + def test_trace(self): + x = paddle.to_tensor([[3, 2], [1, 9]], dtype="float32") + x.stop_gradient = False + out = paddle.trace(x) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x.grad_name]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 2)) + np.testing.assert_allclose(res[0], np.array(12)) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. class TestNoBackwardAPI(unittest.TestCase): From adc2b7459f2bcbe9757f6522af41dddc9f0d5df5 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 24 Apr 2023 16:28:16 +0800 Subject: [PATCH 022/405] Mv eager ut (#53167) * Mv eager tests * fix * fix build error * fix build error * fix codestyle --- paddle/fluid/eager/CMakeLists.txt | 1 - paddle/fluid/eager/tests/CMakeLists.txt | 3 -- test/cpp/CMakeLists.txt | 1 + test/cpp/eager/CMakeLists.txt | 35 +++++++++++++++++++ .../data_structure_tests/CMakeLists.txt | 0 .../accumulation_node_test.cc | 0 .../autograd_meta_test.cc | 2 +- .../data_structure_tests/eager_tensor_test.cc | 0 .../grad_node_info_test.cc | 2 +- .../data_structure_tests/grad_node_test.h | 0 .../grad_tensor_holder_test.cc | 0 .../tensor_wrapper_test.cc | 2 +- .../eager}/performance_tests/CMakeLists.txt | 0 .../performance_tests/benchmark_eager_cpu.cc | 4 +-- .../performance_tests/benchmark_eager_cuda.cc | 4 +-- .../performance_tests/benchmark_fluid_cpu.cc | 4 +-- .../performance_tests/benchmark_fluid_cuda.cc | 4 +-- .../performance_tests/benchmark_utils.cc | 4 +-- .../performance_tests/benchmark_utils.h | 0 .../cpp/eager}/task_tests/CMakeLists.txt | 0 .../cpp/eager}/task_tests/backward_test.cc | 2 +- .../cross_batch_accumulation_test.cc | 2 +- .../cpp/eager}/task_tests/eager_utils_test.cc | 4 +-- .../task_tests/forward_autograd_test.cc | 2 +- .../eager}/task_tests/fwd_bwd_joint_test.cc | 2 +- .../cpp/eager}/task_tests/generated_test.cc | 2 +- .../cpp/eager}/task_tests/grad_test.cc | 2 +- .../cpp/eager}/task_tests/hook_test.cc | 2 +- .../task_tests/hook_test_intermidiate.cc | 2 +- .../eager}/task_tests/nan_inf_utils_test.cc | 0 .../eager}/task_tests/tensor_utils_test.cc | 2 +- .../tests => test/cpp/eager}/test_utils.h | 0 test/cpp/prim/test_eager_prim.cc | 2 +- 33 files changed, 61 insertions(+), 29 deletions(-) delete mode 100644 paddle/fluid/eager/tests/CMakeLists.txt create mode 100644 test/cpp/eager/CMakeLists.txt rename 
{paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/CMakeLists.txt (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/accumulation_node_test.cc (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/autograd_meta_test.cc (97%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/eager_tensor_test.cc (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/grad_node_info_test.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/grad_node_test.h (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/grad_tensor_holder_test.cc (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/data_structure_tests/tensor_wrapper_test.cc (97%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/CMakeLists.txt (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/benchmark_eager_cpu.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/benchmark_eager_cuda.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/benchmark_fluid_cpu.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/benchmark_fluid_cuda.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/benchmark_utils.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/performance_tests/benchmark_utils.h (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/CMakeLists.txt (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/backward_test.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/cross_batch_accumulation_test.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/eager_utils_test.cc (98%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/forward_autograd_test.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/fwd_bwd_joint_test.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/generated_test.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/grad_test.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/hook_test.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/hook_test_intermidiate.cc (99%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/nan_inf_utils_test.cc (100%) rename {paddle/fluid/eager/tests => test/cpp/eager}/task_tests/tensor_utils_test.cc (97%) rename {paddle/fluid/eager/tests => test/cpp/eager}/test_utils.h (100%) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 46d1ff43e0dfe..a0ff3300ffa29 100755 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -39,7 +39,6 @@ add_subdirectory(api) add_subdirectory(custom_operator) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(accumulation) - add_subdirectory(tests) add_subdirectory(pylayer) cc_library( grad_tensor_holder diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt deleted file mode 100644 index 6bcd34262c8ab..0000000000000 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_subdirectory(data_structure_tests) -add_subdirectory(task_tests) -add_subdirectory(performance_tests) diff --git a/test/cpp/CMakeLists.txt b/test/cpp/CMakeLists.txt index 603b7eb78ae7e..2f395cb62ed65 100644 --- a/test/cpp/CMakeLists.txt +++ 
b/test/cpp/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(prim) add_subdirectory(imperative) add_subdirectory(ir) add_subdirectory(inference) +add_subdirectory(eager) diff --git a/test/cpp/eager/CMakeLists.txt b/test/cpp/eager/CMakeLists.txt new file mode 100644 index 0000000000000..d8d3a6304cffc --- /dev/null +++ b/test/cpp/eager/CMakeLists.txt @@ -0,0 +1,35 @@ +set(eager_deps + phi_api + phi_dygraph_api + hook_utils + tensor_utils + utils + global_utils + backward + phi_tensor + tracer + layer + autograd_meta + eager_nan_inf_utils + grad_node_info + grad_tensor_holder + custom_operator_node) + +if(NOT (NOT WITH_PYTHON AND ON_INFER)) + set(eager_deps ${eager_deps} accumulation_node prim_utils) +endif() + +set(fluid_deps + tracer + layer + proto_desc + operator + op_registry + variable_helper + memcpy) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function + dygraph_node) + +add_subdirectory(data_structure_tests) +add_subdirectory(task_tests) +add_subdirectory(performance_tests) diff --git a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt b/test/cpp/eager/data_structure_tests/CMakeLists.txt similarity index 100% rename from paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt rename to test/cpp/eager/data_structure_tests/CMakeLists.txt diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/test/cpp/eager/data_structure_tests/accumulation_node_test.cc similarity index 100% rename from paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc rename to test/cpp/eager/data_structure_tests/accumulation_node_test.cc diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/test/cpp/eager/data_structure_tests/autograd_meta_test.cc similarity index 97% rename from paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc rename to test/cpp/eager/data_structure_tests/autograd_meta_test.cc index f588f8c8c759a..651e3b63f07ac 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc +++ b/test/cpp/eager/data_structure_tests/autograd_meta_test.cc @@ -18,8 +18,8 @@ #include "gtest/gtest.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "test/cpp/eager/data_structure_tests/grad_node_test.h" TEST(AutogradMeta, Constructor) { paddle::Tensor et1; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/test/cpp/eager/data_structure_tests/eager_tensor_test.cc similarity index 100% rename from paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc rename to test/cpp/eager/data_structure_tests/eager_tensor_test.cc diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/test/cpp/eager/data_structure_tests/grad_node_info_test.cc similarity index 98% rename from paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc rename to test/cpp/eager/data_structure_tests/grad_node_info_test.cc index c56db9e611713..0948e6f72aa0b 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/test/cpp/eager/data_structure_tests/grad_node_info_test.cc @@ -19,8 +19,8 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/hooks.h" -#include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include 
"paddle/phi/api/lib/utils/allocator.h" +#include "test/cpp/eager/data_structure_tests/grad_node_test.h" TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/test/cpp/eager/data_structure_tests/grad_node_test.h similarity index 100% rename from paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h rename to test/cpp/eager/data_structure_tests/grad_node_test.h diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/test/cpp/eager/data_structure_tests/grad_tensor_holder_test.cc similarity index 100% rename from paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc rename to test/cpp/eager/data_structure_tests/grad_tensor_holder_test.cc diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc similarity index 97% rename from paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc rename to test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc index 86973debfe323..a3a82b0c3b201 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc @@ -16,8 +16,8 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/fluid/eager/utils.h" +#include "test/cpp/eager/data_structure_tests/grad_node_test.h" TEST(TensorWrapper, Basic) { VLOG(6) << "Test Full reserved"; diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/test/cpp/eager/performance_tests/CMakeLists.txt similarity index 100% rename from paddle/fluid/eager/tests/performance_tests/CMakeLists.txt rename to test/cpp/eager/performance_tests/CMakeLists.txt diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/test/cpp/eager/performance_tests/benchmark_eager_cpu.cc similarity index 98% rename from paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc rename to test/cpp/eager/performance_tests/benchmark_eager_cpu.cc index 54deba9d51652..c3c7d847f9794 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/test/cpp/eager/performance_tests/benchmark_eager_cpu.cc @@ -22,10 +22,10 @@ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" -#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/core/flags.h" +#include "test/cpp/eager/performance_tests/benchmark_utils.h" +#include "test/cpp/eager/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/test/cpp/eager/performance_tests/benchmark_eager_cuda.cc similarity index 98% rename from paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc rename to test/cpp/eager/performance_tests/benchmark_eager_cuda.cc index d7beb958df852..48c2054e7076e 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/test/cpp/eager/performance_tests/benchmark_eager_cuda.cc @@ -21,10 +21,10 @@ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" -#include 
"paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/core/flags.h" +#include "test/cpp/eager/performance_tests/benchmark_utils.h" +#include "test/cpp/eager/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc similarity index 98% rename from paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc rename to test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc index 2289a9e8680b4..18bdb8d2241cf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc @@ -23,11 +23,11 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "test/cpp/eager/performance_tests/benchmark_utils.h" +#include "test/cpp/eager/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc similarity index 98% rename from paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc rename to test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc index c0f2250fbae18..d24c79f390c91 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc @@ -23,11 +23,11 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "test/cpp/eager/performance_tests/benchmark_utils.h" +#include "test/cpp/eager/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/test/cpp/eager/performance_tests/benchmark_utils.cc similarity index 99% rename from paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc rename to test/cpp/eager/performance_tests/benchmark_utils.cc index c84dd7661b669..83d14a6b45b89 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/test/cpp/eager/performance_tests/benchmark_utils.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" +#include "test/cpp/eager/performance_tests/benchmark_utils.h" #include #include @@ -24,8 +24,8 @@ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/utils.h" +#include "test/cpp/eager/test_utils.h" // Eager Generated #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/test/cpp/eager/performance_tests/benchmark_utils.h similarity index 100% rename from paddle/fluid/eager/tests/performance_tests/benchmark_utils.h rename to test/cpp/eager/performance_tests/benchmark_utils.h diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/test/cpp/eager/task_tests/CMakeLists.txt similarity index 100% rename from paddle/fluid/eager/tests/task_tests/CMakeLists.txt rename to test/cpp/eager/task_tests/CMakeLists.txt diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/test/cpp/eager/task_tests/backward_test.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/backward_test.cc rename to test/cpp/eager/task_tests/backward_test.cc index 43a994068ca82..b26945d00ae14 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/test/cpp/eager/task_tests/backward_test.cc @@ -24,10 +24,10 @@ #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/test/cpp/eager/task_tests/cross_batch_accumulation_test.cc similarity index 98% rename from paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc rename to test/cpp/eager/task_tests/cross_batch_accumulation_test.cc index 28cc3b2b30b34..6d48970f2e95f 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/test/cpp/eager/task_tests/cross_batch_accumulation_test.cc @@ -23,10 +23,10 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/test/cpp/eager/task_tests/eager_utils_test.cc similarity index 98% rename from paddle/fluid/eager/tests/task_tests/eager_utils_test.cc rename to test/cpp/eager/task_tests/eager_utils_test.cc index d9d277ad59b19..77902fa5eed50 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/test/cpp/eager/task_tests/eager_utils_test.cc @@ -18,11 +18,11 @@ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" -#include 
"paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/utils.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" +#include "test/cpp/eager/data_structure_tests/grad_node_test.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/test/cpp/eager/task_tests/forward_autograd_test.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc rename to test/cpp/eager/task_tests/forward_autograd_test.cc index b25b40571fb71..41ca170998203 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/test/cpp/eager/task_tests/forward_autograd_test.cc @@ -21,10 +21,10 @@ #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/test/cpp/eager/task_tests/fwd_bwd_joint_test.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc rename to test/cpp/eager/task_tests/fwd_bwd_joint_test.cc index 5098be17bca47..4e9dffa28e415 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/test/cpp/eager/task_tests/fwd_bwd_joint_test.cc @@ -23,10 +23,10 @@ #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/test/cpp/eager/task_tests/generated_test.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/generated_test.cc rename to test/cpp/eager/task_tests/generated_test.cc index 7b7c47ecb71f7..dcb04ba66ca66 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/test/cpp/eager/task_tests/generated_test.cc @@ -22,10 +22,10 @@ #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/core/kernel_registry.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/test/cpp/eager/task_tests/grad_test.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/grad_test.cc rename to test/cpp/eager/task_tests/grad_test.cc index 8f76b6d05ee87..a1e2dece37595 100644 --- a/paddle/fluid/eager/tests/task_tests/grad_test.cc +++ b/test/cpp/eager/task_tests/grad_test.cc @@ -23,10 +23,10 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include 
"paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/test/cpp/eager/task_tests/hook_test.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/hook_test.cc rename to test/cpp/eager/task_tests/hook_test.cc index 2c67427c55bbe..0105d52846ed0 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/test/cpp/eager/task_tests/hook_test.cc @@ -23,10 +23,10 @@ #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/test/cpp/eager/task_tests/hook_test_intermidiate.cc similarity index 99% rename from paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc rename to test/cpp/eager/task_tests/hook_test_intermidiate.cc index fe63ff7da48d1..5c91b1be253a9 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/test/cpp/eager/task_tests/hook_test_intermidiate.cc @@ -20,10 +20,10 @@ #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/test/cpp/eager/task_tests/nan_inf_utils_test.cc similarity index 100% rename from paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc rename to test/cpp/eager/task_tests/nan_inf_utils_test.cc diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/test/cpp/eager/task_tests/tensor_utils_test.cc similarity index 97% rename from paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc rename to test/cpp/eager/task_tests/tensor_utils_test.cc index f055839e516ad..21e6304fe03aa 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/test/cpp/eager/task_tests/tensor_utils_test.cc @@ -20,9 +20,9 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" +#include "test/cpp/eager/test_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/test_utils.h b/test/cpp/eager/test_utils.h similarity index 100% rename from paddle/fluid/eager/tests/test_utils.h rename to test/cpp/eager/test_utils.h diff --git a/test/cpp/prim/test_eager_prim.cc b/test/cpp/prim/test_eager_prim.cc index 5055e439c40fd..d2cdeb80495c1 100644 --- a/test/cpp/prim/test_eager_prim.cc +++ b/test/cpp/prim/test_eager_prim.cc @@ -20,11 +20,11 @@ #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include 
"paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/backward.h" -#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/prim/utils/utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "test/cpp/eager/test_utils.h" #include "test/cpp/prim/init_env_utils.h" DECLARE_string(tensor_operants_mode); From 71474b10c2f0d3eb6f4f296a1a053d1d6c7c5ca0 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Mon, 24 Apr 2023 16:34:59 +0800 Subject: [PATCH 023/405] fix static_assert with no message (#53222) --- paddle/phi/kernels/funcs/fused_gemm_epilogue.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h index 4f1a1c6f0bde1..8a2309ba26000 100644 --- a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h +++ b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h @@ -535,8 +535,10 @@ void ComputeFusedGemmEpilogueBackwardImpl(const phi::GPUContext& dev_ctx, bool use_addto_dx, bool use_addto_dy) { using MT = typename phi::dtype::MPTypeTrait::Type; - static_assert(std::is_same::value || std::is_same::value); - static_assert(std::is_same::value || std::is_same::value); + static_assert(std::is_same::value || std::is_same::value, + ""); + static_assert(std::is_same::value || std::is_same::value, + ""); using Trait = FusedGEMMGradTrait; if (dx) { From ae426b784af7af148d3804c39748a46eb3ef39fe Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 24 Apr 2023 16:37:16 +0800 Subject: [PATCH 024/405] fix compile bug of kps (#53251) --- .../kps/{elementwise_kernel.cu => elementwise_raw_kernel.cu} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename paddle/phi/kernels/legacy/kps/{elementwise_kernel.cu => elementwise_raw_kernel.cu} (100%) diff --git a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu similarity index 100% rename from paddle/phi/kernels/legacy/kps/elementwise_kernel.cu rename to paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu From bfa5d6b8f3198da42bdcfe52f2ee2e07866be0a8 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Mon, 24 Apr 2023 16:53:32 +0800 Subject: [PATCH 025/405] transform cachekv datalayout of fused_multi_transformer_xpu (#53144) --- paddle/fluid/framework/ir/CMakeLists.txt | 6 + paddle/fluid/framework/ir/pass.cc | 1 + ...i_transformer_cachekv_layout_trans_pass.cc | 202 ++++++++++++++++++ ...ti_transformer_cachekv_layout_trans_pass.h | 79 +++++++ ...nsformer_cachekv_layout_trans_pass_test.cc | 125 +++++++++++ .../inference/api/paddle_pass_builder.cc | 1 + paddle/phi/infermeta/fusion.cc | 73 +++---- 7 files changed, 448 insertions(+), 39 deletions(-) create mode 100644 paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.cc create mode 100644 paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.h create mode 100644 paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass_test.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 9fa7527827ae4..944dbd07bab5c 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -245,6 +245,8 @@ if(WITH_XPU) pass_library(fused_multi_transformer_xpu_quant_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + 
pass_library(fused_multi_transformer_cachekv_layout_trans_pass inference DIR + xpu DEPS ${XPU_PASS_DEPS}) endif() cc_library( @@ -528,4 +530,8 @@ if(WITH_XPU) test_stack_fuse_pass SRCS xpu/stack_fuse_pass_test.cc DEPS stack_fuse_pass) + cc_test( + test_fused_multi_transformer_cachekv_layout_trans_pass + SRCS xpu/fused_multi_transformer_cachekv_layout_trans_pass_test.cc + DEPS fused_multi_transformer_cachekv_layout_trans_pass) endif() diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index cc4033f7f5a54..6611be59fcc80 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -62,6 +62,7 @@ static const std::vector xpu_support_subgraph_passes = { "embedding_with_eltwise_add_xpu_fuse_pass", "multi_encoder_xpu_fuse_pass", "multi_encoder_xpu_slice_fuse_pass", + "fused_multi_transformer_cachekv_layout_trans_pass", "one_beam_size_fuse_pass", "stack_fuse_pass", "fused_multi_transformer_xpu_quant_pass", diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.cc new file mode 100644 index 0000000000000..993b5a055280d --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct FusedMultiTransformerFillConstantPattern : public PatternBase { + FusedMultiTransformerFillConstantPattern(PDPattern* pattern, + const std::string& name_scope); + + // declare operator node's name + PATTERN_DECL_NODE(fill_constant); + PATTERN_DECL_NODE(fused_multi_transformer); + // declare variable node's name + PATTERN_DECL_NODE(fill_constant_out); +}; // struct FusedMultiTransformerFillConstantPattern + +FusedMultiTransformerFillConstantPattern:: + FusedMultiTransformerFillConstantPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* fill_constant = pattern->NewNode(fill_constant_repr()) + ->assert_is_op("fill_constant") + ->assert_has_n_inputs(5) + ->assert_more([](Node* node) { + return node->Op()->GetAttrIfExists( + "friendly_device_type") != "xpu"; + }); + auto* fill_constant_out = pattern->NewNode(fill_constant_out_repr()) + ->assert_is_op_output("fill_constant", "Out"); + auto* fused_multi_transformer = + pattern->NewNode(fused_multi_transformer_repr()) + ->assert_is_op("fused_multi_transformer"); + + fill_constant->LinksTo({fill_constant_out}); + fused_multi_transformer->LinksFrom({fill_constant_out}); +} + +struct FusedMultiTransformerGatherPattern : public PatternBase { + FusedMultiTransformerGatherPattern(PDPattern* pattern, + const std::string& name_scope); + + // declare operator node's name + PATTERN_DECL_NODE(fused_multi_transformer); + PATTERN_DECL_NODE(gather); + // declare variable node's name + PATTERN_DECL_NODE(gather_in); + PATTERN_DECL_NODE(gather_out); +}; // struct FusedMultiTransformerGatherPattern + +FusedMultiTransformerGatherPattern::FusedMultiTransformerGatherPattern( + PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* gather_in = + pattern->NewNode(gather_in_repr())->assert_is_op_input("gather", "X"); + auto* gather = pattern->NewNode(gather_repr()) + ->assert_is_op("gather") + ->assert_more([](Node* node) { + return node->Op()->GetAttrIfExists("axis") == 1; + }); + auto* gather_out = + pattern->NewNode(gather_out_repr())->assert_is_op_output("gather", "Out"); + auto* fused_multi_transformer = + pattern->NewNode(fused_multi_transformer_repr()) + ->assert_is_op("fused_multi_transformer"); + + gather->LinksFrom({gather_in}).LinksTo({gather_out}); + fused_multi_transformer->LinksFrom({gather_out}); +} +} // namespace patterns + +void FusedMultiTransformerCacheKVLayoutTransPass::FillConstantReshapePass( + ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + GraphPatternDetector gpd; + patterns::FusedMultiTransformerFillConstantPattern pattern( + gpd.mutable_pattern(), name_scope_); + + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle FillConstantReshapePass"; + GET_IR_NODE(fused_multi_transformer); + GET_IR_NODE(fill_constant); + GET_IR_NODE(fill_constant_out); + auto cachekv_names = fused_multi_transformer->Op()->Input("CacheKV"); + if 
(std::count(cachekv_names.begin(), + cachekv_names.end(), + fill_constant_out->Name()) == 0) + return; + + auto fill_constant_input_names = + fill_constant->Op()->Input("ShapeTensorList"); + auto fill_constant_trans_input_names = + std::vector{fill_constant_input_names[0], + fill_constant_input_names[3], + fill_constant_input_names[1], + fill_constant_input_names[2], + fill_constant_input_names[4]}; + fill_constant->Op()->SetInput("ShapeTensorList", + fill_constant_trans_input_names); + + auto fill_constant_output_shape = fill_constant_out->Var()->GetShape(); + fill_constant_out->Var()->SetShape({fill_constant_output_shape[0], + fill_constant_output_shape[3], + fill_constant_output_shape[1], + fill_constant_output_shape[2], + fill_constant_output_shape[4]}); + + fused_multi_transformer->Op()->SetAttr("friendly_device_type", + std::string("xpu")); + fill_constant->Op()->SetAttr("friendly_device_type", std::string("xpu")); + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +void FusedMultiTransformerCacheKVLayoutTransPass::GatherReshapePass( + ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + GraphPatternDetector gpd; + patterns::FusedMultiTransformerGatherPattern pattern(gpd.mutable_pattern(), + name_scope_); + + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle GatherReshapePass"; + GET_IR_NODE(gather); + GET_IR_NODE(fused_multi_transformer); + GET_IR_NODE(gather_in); + GET_IR_NODE(gather_out); + auto cachekv_names = fused_multi_transformer->Op()->Input("CacheKV"); + if (std::count(cachekv_names.begin(), + cachekv_names.end(), + gather_out->Name()) == 0) + return; + + auto gather_in_shape = gather_in->Var()->GetShape(); + auto gather_out_shape = gather_out->Var()->GetShape(); + gather_in->Var()->SetShape({gather_in_shape[0], + gather_in_shape[3], + gather_in_shape[1], + gather_in_shape[2], + gather_in_shape[4]}); + gather_out->Var()->SetShape({gather_out_shape[0], + gather_out_shape[3], + gather_out_shape[1], + gather_out_shape[2], + gather_out_shape[4]}); + gather->Op()->SetAttr("axis", 2); + fused_multi_transformer->Op()->SetAttr("friendly_device_type", + std::string("xpu")); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +void FusedMultiTransformerCacheKVLayoutTransPass::ApplyImpl( + ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + + FillConstantReshapePass(graph); + GatherReshapePass(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS( + fused_multi_transformer_cachekv_layout_trans_pass, + paddle::framework::ir::FusedMultiTransformerCacheKVLayoutTransPass); diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.h b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.h new file mode 100644 index 0000000000000..cb87317a76e6a --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass.h @@ -0,0 +1,79 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class FusedMultiTransformerCacheKVLayoutTransPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + /* + Origin subgraph: + (ShapeTensorList: [d0,d1,d2,d3,d4]) + | + fill_constant + | + fused_multi_transformer + + Fused subgraph: + (ShapeTensorList: [d0,d3,d1,d2,d4]) + | + fill_constant + | + fused_multi_transformer + */ + void FillConstantReshapePass(ir::Graph* graph) const; + + /* + Origin subgraph: + (gather_x: [d0,d1,d2,d3,d4]) + | + gather(axis=1) + | + fused_multi_transformer + + Fused subgraph: + (gather_x: [d0,d3,d1,d2,d4]) + | + gather(axis=2) + | + fused_multi_transformer + */ + void GatherReshapePass(ir::Graph* graph) const; + + const std::string name_scope_{ + "fused_multi_transformer_cachekv_layout_trans_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass_test.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass_test.cc new file mode 100644 index 0000000000000..ec5dba201163f --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_cachekv_layout_trans_pass_test.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +VarDesc* Data(paddle::framework::BlockDesc* block, + std::string name, + std::vector shape = {}, + bool is_persistable = false, + proto::VarType::Type data_type = proto::VarType::FP32) { + auto* var = block->Var(name); + var->SetType(proto::VarType::LOD_TENSOR); + var->SetDataType(data_type); + var->SetShape(shape); + var->SetPersistable(is_persistable); + return var; +} + +VarDesc* fill_constant(BlockDesc* block, std::vector shapes) { + VarDesc* out = Data(block, shapes[0]->Name() + "_out"); + OpDesc* op = block->AppendOp(); + op->SetType("fill_constant"); + std::vector shape_names; + for (auto shape : shapes) { + shape_names.push_back(shape->Name()); + } + op->SetInput("ShapeTensorList", {shape_names}); + op->SetOutput("Out", {out->Name()}); + return out; +} + +TEST(FillConstantReshapePass, basic) { + paddle::framework::ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* shape0 = Data(block, "shape0"); + auto* shape1 = Data(block, "shape1"); + auto* shape2 = Data(block, "shape2"); + auto* shape3 = Data(block, "shape3"); + auto* shape4 = Data(block, "shape4"); + auto* shape5 = Data(block, "shape5"); + auto* shape6 = Data(block, "shape6"); + auto* shape7 = Data(block, "shape7"); + auto* shape8 = Data(block, "shape8"); + auto* shape9 = Data(block, "shape9"); + auto* fill0 = fill_constant(block, {shape0, shape1, shape2, shape3, shape4}); + fill0->SetShape({1, 2, 3, 4, 5}); + auto* fill1 = fill_constant(block, {shape5, shape6, shape7, shape8, shape9}); + fill1->SetShape({1, 2, 3, 4, 5}); + OpDesc* fused_multi_transformer = block->AppendOp(); + fused_multi_transformer->SetType("fused_multi_transformer"); + fused_multi_transformer->SetInput("CacheKV", {fill0->Name(), fill1->Name()}); + + std::unique_ptr graph(new ir::Graph(program)); + auto pass = PassRegistry::Instance().Get( + "fused_multi_transformer_cachekv_layout_trans_pass"); + pass->Apply(graph.get()); + auto fills = GetOpNodes(graph, "fill_constant"); + auto fill0_in_names = fills[0]->Op()->Input("ShapeTensorList"); + std::vector expect_fill0_in_names{ + "shape0", "shape3", "shape1", "shape2", "shape4"}; + PADDLE_ENFORCE_EQ(fill0_in_names, + expect_fill0_in_names, + platform::errors::PreconditionNotMet( + "fill_constant name should be updated.")); + auto fill1_in_names = fills[1]->Op()->Input("ShapeTensorList"); + std::vector expect_fill1_in_names{ + "shape5", "shape8", "shape6", "shape7", "shape9"}; + PADDLE_ENFORCE_EQ(fill1_in_names, + expect_fill1_in_names, + platform::errors::PreconditionNotMet( + "fill_constant name should be updated.")); +} + +TEST(GatherReshapePass, basic) { + Layers layers; + auto* gather0_x = layers.data("gather0_x", {2, 1, 24, 512, 64}); + auto* gather0_index = layers.data("gather0_index", {1}); + auto* gather0_out = layers.gather(gather0_x, gather0_index, 1); + gather0_out->SetShape({2, 1, 24, 512, 64}); + auto* gather1_x = layers.data("gather1_x", {2, 1, 24, 512, 64}); + auto* gather1_index = layers.data("gather1_index", {1}); + auto* gather1_out = layers.gather(gather1_x, gather1_index, 1); + gather1_out->SetShape({2, 1, 24, 512, 64}); + auto* block = layers.Block(); + OpDesc* fused_multi_transformer = block->AppendOp(); + fused_multi_transformer->SetType("fused_multi_transformer"); + fused_multi_transformer->SetInput("CacheKV", + {gather0_out->Name(), gather1_out->Name()}); + + std::unique_ptr graph(new 
ir::Graph(layers.main_program())); + auto pass = PassRegistry::Instance().Get( + "fused_multi_transformer_cachekv_layout_trans_pass"); + pass->Apply(graph.get()); + auto gathers = GetOpNodes(graph, "gather"); + for (auto* gather : gathers) { + PADDLE_ENFORCE_EQ( + gather->Op()->GetAttrIfExists("axis"), + 2, + platform::errors::PreconditionNotMet( + "gather's axis attr should be updated to 2 by pass.")); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fused_multi_transformer_cachekv_layout_trans_pass); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1be90972b924b..e7c24272b81c5 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -519,6 +519,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "embedding_with_eltwise_add_xpu_fuse_pass", "multi_encoder_xpu_fuse_pass", "multi_encoder_xpu_slice_fuse_pass", + "fused_multi_transformer_cachekv_layout_trans_pass", "one_beam_size_fuse_pass", "delete_cast_op_pass", "stack_fuse_pass", diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 5c0aa3b8e89fd..f775cedce8c11 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -291,31 +291,26 @@ void FusedMultiTransformerXpuInferMeta( std::vector cache_kv_out) { auto x_dim = x.dims(); auto y_dim = qkvw[0]->dims(); - PADDLE_ENFORCE_EQ( - x_dim.size(), - 3, - phi::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); + PADDLE_ENFORCE_EQ(x_dim.size(), + 3, + phi::errors::InvalidArgument( + "The dimensions of x must be 3(batch_size, seq_len, " + "dim_embed), but received dimensions of Input is [%d]", + x_dim.size())); PADDLE_ENFORCE_EQ( y_dim.size(), 4, - phi::errors::InvalidArgument("The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "but received dimensions of" - "Input is [%d]", - y_dim.size())); + phi::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4(3, num_head, dim_head, " + "dim_embed), but received dimensions of qkv_weight is [%d]", + y_dim.size())); PADDLE_ENFORCE_EQ( x_dim[2], trans_qkvw ? y_dim[3] : y_dim[0], phi::errors::InvalidArgument( - "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is " - "true) or y_dim[0](trans_qkvw is false)" - "must be equal. But received: the shape " - "of input x = [%s], and the shape of " - "input qkv_weight = [%s]", + "The dimension of x_dim[2] and y_dim[3](trans_qkvw is true) or " + "y_dim[0](trans_qkvw is false) must be equal, but received: the " + "shape of input x = [%s], and the shape of input qkv_weight = [%s]", x_dim, y_dim)); if (cache_kv.size() > 0) { @@ -330,27 +325,27 @@ void FusedMultiTransformerXpuInferMeta( phi::errors::InvalidArgument( "The first dim of CacheKV must be 2, but got %d", c_dim[0])); // 2 - PADDLE_ENFORCE_EQ(c_dim[1], - x_dim[0], - phi::errors::InvalidArgument( - "The second dim of CacheKV must be equal with " - "batch size %d, but got %d", - x_dim[0], - c_dim[1])); // batch_size - PADDLE_ENFORCE_EQ(c_dim[2], - trans_qkvw ? y_dim[1] : y_dim[2], - phi::errors::InvalidArgument( - "The third dim of CacheKV must be equal with num " - "head %d, but got %d", - trans_qkvw ? y_dim[1] : y_dim[2], - c_dim[2])); // num_head - PADDLE_ENFORCE_EQ(c_dim[4], - trans_qkvw ? 
y_dim[2] : y_dim[3], - phi::errors::InvalidArgument( - "The fifth dim of CacheKV must be equal with head " - "size %d, but got %d", - trans_qkvw ? y_dim[2] : y_dim[3], - c_dim[4])); // head_size + PADDLE_ENFORCE_EQ( + c_dim[2], + x_dim[0], + phi::errors::InvalidArgument("The third dim of CacheKV must be equal " + "with batch size %d, but got %d", + x_dim[0], + c_dim[2])); // batch_size + PADDLE_ENFORCE_EQ( + c_dim[3], + trans_qkvw ? y_dim[1] : y_dim[2], + phi::errors::InvalidArgument("The fourth dim of CacheKV must be equal " + "with num head %d, but got %d", + trans_qkvw ? y_dim[1] : y_dim[2], + c_dim[3])); // num_head + PADDLE_ENFORCE_EQ( + c_dim[4], + trans_qkvw ? y_dim[2] : y_dim[3], + phi::errors::InvalidArgument("The fifth dim of CacheKV must be equal " + "with head size %d, but got %d", + trans_qkvw ? y_dim[2] : y_dim[3], + c_dim[4])); // head_size } out->set_dims(x_dim); From a0aff194a181de8d7b31932bbfcaee11ce7fcd8b Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Mon, 24 Apr 2023 17:01:07 +0800 Subject: [PATCH 026/405] Fix the calculation of layer_norm_bwd (#53224) * Fix the calculation of layer_norm_bwd * fix --- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index d8ade4612c85e..b240be28ec949 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -1603,13 +1603,13 @@ __global__ void LayerNormBackwardGradientAll( for (int64_t i = beg_idx; i < end_idx; i += stride) { int row_idx = i / feature_size; - auto var_val = real_sqrt(static_cast(var[row_idx]) + epsilon); + auto var_val = rsqrt_(static_cast(var[row_idx]) + epsilon); d_scale_partial += static_cast(d_y[i]) * - (static_cast(x[i]) - mean[row_idx]) / var_val; + (static_cast(x[i]) - mean[row_idx]) * var_val; d_bias_partial += static_cast(d_y[i]); if (HasDx) { d_x[i] = static_cast(static_cast(d_y[i]) * - static_cast(scale[blockIdx.x + col_offset]) / + static_cast(scale[blockIdx.x + col_offset]) * var_val); } } @@ -1659,10 +1659,10 @@ __global__ void LayerNormBackwardGradientScaleOrBias( for (int64_t i = beg_idx; i < end_idx; i += stride) { int row_idx = i / feature_size; auto var_val = - static_cast(real_sqrt(static_cast(var[row_idx]) + epsilon)); + static_cast(rsqrt_(static_cast(var[row_idx]) + epsilon)); if (HasDScale) { d_scale_or_d_bias_partial += static_cast(d_y[i]) * - (static_cast(x[i]) - mean[row_idx]) / + (static_cast(x[i]) - mean[row_idx]) * var_val; } else { // d_bias != nullptr d_scale_or_d_bias_partial += static_cast(d_y[i]); @@ -1671,10 +1671,10 @@ __global__ void LayerNormBackwardGradientScaleOrBias( if (HasDx) { if (scale != nullptr) { d_x[i] = static_cast(static_cast(d_y[i]) * - static_cast(scale[blockIdx.x + col_offset]) / + static_cast(scale[blockIdx.x + col_offset]) * var_val); } else { - d_x[i] = static_cast(static_cast(d_y[i]) / var_val); + d_x[i] = static_cast(static_cast(d_y[i]) * var_val); } } } @@ -1762,13 +1762,13 @@ __global__ void LayerNormBackwardGradientOnlyDX( U d_x_mean_partial = static_cast(0), d_x_var_partial = static_cast(0); for (int64_t i = beg_idx; i < end_idx; i += BlockDim) { auto var_val = - static_cast(real_sqrt(static_cast(block_var) + epsilon)); + static_cast(rsqrt_(static_cast(block_var) + epsilon)); if (scale != nullptr) { int col_idx = i % feature_size; d_x[i] = static_cast(static_cast(d_y[i]) * - 
static_cast(scale[col_idx]) / var_val); + static_cast(scale[col_idx]) * var_val); } else { - d_x[i] = static_cast(static_cast(d_y[i]) / var_val); + d_x[i] = static_cast(static_cast(d_y[i]) * var_val); } d_x_mean_partial += static_cast(d_x[i]); d_x_var_partial += @@ -1812,21 +1812,20 @@ __global__ void LayerNormBackwardWhenBatchSizeIsOne( int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; using ScaleBiasT = LayerNormScaleBiasT; if (idx < feature_size) { - auto var_val = - static_cast(real_sqrt(static_cast(var[0]) + epsilon)); + auto var_val = static_cast(rsqrt_(static_cast(var[0]) + epsilon)); if (d_x != nullptr) { if (d_scale == nullptr) { - d_x[idx] = static_cast(static_cast(d_y[idx]) / var_val); + d_x[idx] = static_cast(static_cast(d_y[idx]) * var_val); } else { d_x[idx] = static_cast(static_cast(d_y[idx]) * - static_cast(scale[idx]) / var_val); + static_cast(scale[idx]) * var_val); } } if (d_scale != nullptr) { d_scale[idx] = static_cast(static_cast(d_y[idx]) * - (static_cast(x[idx]) - mean[0]) / var_val); + (static_cast(x[idx]) - mean[0]) * var_val); } if (d_bias != nullptr) { From c0a604e79c83b57f1b75f539b11f17899ced8bd3 Mon Sep 17 00:00:00 2001 From: YangQun Date: Mon, 24 Apr 2023 02:01:29 -0700 Subject: [PATCH 027/405] [Zero-Dim] support 0d tensor for shape and squeeze onednn kernel (#52832) * support 0d tensor for shape and squeeze onednn kernel * set python api for shape op ut --- paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc | 5 +++-- paddle/phi/kernels/onednn/squeeze_grad_kernel.cc | 4 +++- paddle/phi/kernels/onednn/squeeze_kernel.cc | 6 ++++-- .../unittests/white_list/check_shape_white_list.py | 2 ++ test/mkldnn/test_shape_mkldnn_op.py | 9 ++++++++- test/mkldnn/test_squeeze2_mkldnn_op.py | 14 ++++++++++++++ 6 files changed, 34 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 79e84e4755337..e2fc1c19f8e39 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -89,8 +89,9 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { astream.wait(); out->Resize(out_dims); - out->set_mem_desc( - reorder_dst_memory_p->get_desc().reshape(phi::vectorize(out_dims))); + auto reshape_dims = out_dims.size() != 0 ? phi::vectorize(out_dims) + : std::vector{1}; + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(reshape_dims)); } void InferInOutShape(const framework::ExecutionContext& ctx, diff --git a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc index 654acfe5700c3..93b8fc75a9a48 100644 --- a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc @@ -25,7 +25,9 @@ void SqueezeGradKernel(const Context& dev_ctx, const DenseTensor& dout, const IntArray& axes, DenseTensor* dx) { - auto dout_vec_dims = vectorize(dout.dims()); + auto dout_vec_dims = dout.dims().size() != 0 ? 
vectorize(dout.dims()) + : std::vector{1}; + auto dout_type = funcs::ToOneDNNDataType(dout.dtype()); funcs::ReorderOneDNNHandler reorder_handler( diff --git a/paddle/phi/kernels/onednn/squeeze_kernel.cc b/paddle/phi/kernels/onednn/squeeze_kernel.cc index 0ad82bfeddace..9b86f9e1a9c74 100644 --- a/paddle/phi/kernels/onednn/squeeze_kernel.cc +++ b/paddle/phi/kernels/onednn/squeeze_kernel.cc @@ -47,8 +47,10 @@ void ExecuteSqueeze(const Context& dev_ctx, astream.wait(); out->Resize(out_dims); - out->set_mem_desc( - reorder_dst_memory_p->get_desc().reshape(vectorize(out_dims))); + + auto reshape_dims = + out_dims.size() != 0 ? vectorize(out_dims) : std::vector{1}; + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(reshape_dims)); } template diff --git a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py index 86f84d00cea55..db5a710867277 100644 --- a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py @@ -31,4 +31,6 @@ 'multi_dot', 'index_add', 'reshape2', + 'squeeze', + 'squeeze2', ] diff --git a/test/mkldnn/test_shape_mkldnn_op.py b/test/mkldnn/test_shape_mkldnn_op.py index 2616ad65458c0..b954336de23e2 100644 --- a/test/mkldnn/test_shape_mkldnn_op.py +++ b/test/mkldnn/test_shape_mkldnn_op.py @@ -21,10 +21,10 @@ from paddle.fluid.tests.unittests.eager_op_test import OpTest, OpTestTool -@OpTestTool.skip_if_not_cpu_bf16() class TestShape3DFP32OneDNNOp(OpTest): def setUp(self): self.op_type = "shape" + self.python_api = paddle.tensor.shape self.config() self.attrs = {'use_mkldnn': True} self.inputs = {'Input': np.zeros(self.shape).astype(self.dtype)} @@ -38,6 +38,13 @@ def test_check_output(self): self.check_output_with_place(core.CPUPlace()) +class TestShape0DFP32OneDNNOp(TestShape3DFP32OneDNNOp): + def config(self): + self.shape = [] + self.dtype = np.float32 + + +@OpTestTool.skip_if_not_cpu_bf16() class TestShape6DBF16OneDNNOp(TestShape3DFP32OneDNNOp): def config(self): self.shape = [10, 2, 3, 4, 5, 2] diff --git a/test/mkldnn/test_squeeze2_mkldnn_op.py b/test/mkldnn/test_squeeze2_mkldnn_op.py index ea914f62438d8..0904be811c0bc 100644 --- a/test/mkldnn/test_squeeze2_mkldnn_op.py +++ b/test/mkldnn/test_squeeze2_mkldnn_op.py @@ -76,6 +76,20 @@ def test_check_output(self): self.check_output_with_place(core.CPUPlace()) +class TestSqueeze2OneDNNOp_ZeroDim(TestSqueeze2OneDNNOp): + def init_test_case(self): + self.ori_shape = [1] + self.axes = () + self.new_shape = () + + +class TestSqueezeOneDNNOp_ZeroDim(TestSqueezeOneDNNOp): + def init_test_case(self): + self.ori_shape = [1] + self.axes = () + self.new_shape = () + + class TestSqueeze2OneDNNOp1(TestSqueeze2OneDNNOp): def init_test_case(self): self.ori_shape = (1, 20, 1, 5) From 41138718fa8e14e0d95350837385d6d462a3ba72 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 24 Apr 2023 17:23:25 +0800 Subject: [PATCH 028/405] Add "enable_tensor_checker" and "disable_tensor_checker" to api list (#52936) --- paddle/fluid/eager/nan_inf_utils.cc | 26 +- paddle/fluid/eager/nan_inf_utils.h | 4 + .../framework/details/nan_inf_utils_detail.cc | 8 + .../framework/details/nan_inf_utils_detail.cu | 2 +- .../framework/details/nan_inf_utils_detail.h | 4 + paddle/fluid/pybind/pybind.cc | 12 + python/paddle/amp/debugging.py | 356 ++++++++++-------- .../fluid/tests/unittests/test_nan_inf_dir.py | 145 +------ 
.../tests/unittests/test_tensor_checker.py | 111 ++++++ 9 files changed, 359 insertions(+), 309 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_checker.py diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 6eae40fca36cf..e71ae7cf11939 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -24,8 +24,6 @@ DECLARE_int32(check_nan_inf_level); namespace egr { -static std::once_flag dump_list_init_flag; - static std::unordered_set& nan_inf_check_op_list() { static std::unordered_set _check_op_list = {}; return _check_op_list; @@ -36,39 +34,32 @@ static std::unordered_set& nan_inf_skip_op_list() { return _skip_op_list; } -static void InitDumpListFormEnv() { +void SetCheckOpList(const std::string& check_op_list = "") { nan_inf_check_op_list(); - nan_inf_skip_op_list(); - const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list"); - const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list"); - - if (check_op_list) { + if (check_op_list.size() != 0) { std::stringstream ss(check_op_list); std::string op_type; LOG(INFO) << "Please set op's name according to the " "paddle.amp.low_precision_op_list()"; while (std::getline(ss, op_type, ',')) { nan_inf_check_op_list().emplace(op_type); + VLOG(4) << "Check nan inf op list: " << op_type; } } +} - if (skip_op_list) { +void SetSkipOpList(const std::string& skip_op_list = "") { + nan_inf_skip_op_list(); + if (skip_op_list.size() != 0) { std::stringstream ss(skip_op_list); std::string op_type; LOG(INFO) << "Please set op's name according to the " "paddle.amp.low_precision_op_list()"; while (std::getline(ss, op_type, ',')) { nan_inf_skip_op_list().emplace(op_type); + VLOG(4) << "Skip nan inf op list: " << op_type; } } - - for (auto const& key : nan_inf_check_op_list()) { - LOG(INFO) << "Check nan inf op list: " << key; - } - - for (auto const& key : nan_inf_skip_op_list()) { - LOG(INFO) << "Skip nan inf op list: " << key; - } } bool CheckOp(const std::string& api_name) { @@ -89,7 +80,6 @@ bool CheckOp(const std::string& api_name) { } void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { - std::call_once(dump_list_init_flag, InitDumpListFormEnv); auto op_name = phi::TransToFluidOpName(api_name); if (tensor.initialized() && CheckOp(op_name)) { auto& tensor_name = tensor.name(); diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index 8d7ed7ffb76b2..4f412cf6db88d 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -65,6 +65,10 @@ void CheckTensorHasNanOrInf( void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTensorAndVector& tensors); +void SetCheckOpList(const std::string& check_op_list); + +void SetSkipOpList(const std::string& skip_op_list); + void CheckTensorHasNanOrInf( const std::string& api_name, const paddle::small_vector, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index e3e08e8b7df28..7890e37e67246 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -30,6 +30,7 @@ namespace details { struct DebugTools { DebugTools() {} std::string path = ""; + int stack_limit = 1; }; static DebugTools debug_nan_inf; @@ -45,6 +46,13 @@ std::string GetNanPath() { return debug_nan_inf.path + "/"; } +void SetNanInfStackLimit(const int& stack_limit) { + 
debug_nan_inf.stack_limit = stack_limit; + VLOG(4) << "Set the stack limit of debug tools : " << stack_limit; +} + +int GetNanInfStackLimit() { return debug_nan_inf.stack_limit; } + static std::once_flag white_list_init_flag; static int op_role_nan_inf_white_list = 0; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index dd99adfecfcd9..3e001299e8e38 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -516,7 +516,7 @@ void TensorCheckerVisitor::apply( check_nan_inf_level, nan_inf_zero_tensor.data()); - if (check_nan_inf_level == 0) { + if (check_nan_inf_level == 0 && GetNanInfStackLimit() > 0) { auto nan_cpu = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3); int64_t* nan_cpu_ptr = reinterpret_cast(nan_cpu->ptr()); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 8f5eb5352ac7b..59865162cc365 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -40,6 +40,10 @@ void SetNanInfDebugPath(const std::string& nan_inf_path); std::string GetNanPath(); +void SetNanInfStackLimit(const int& stack_limit); + +int GetNanInfStackLimit(); + template ::value, bool> = true> diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index bde6357ccbe2f..359c2266f8ea3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2670,9 +2670,21 @@ All parameter, weight, gradient are variables in Paddle. m.def("use_layout_autotune", [] { return egr::Controller::Instance().UseLayoutAutoTune(); }); // Add the api for nan op debug + m.def("set_nan_inf_stack_limit", + &paddle::framework::details::SetNanInfStackLimit); + + // Add the api for nan op debug m.def("set_nan_inf_debug_path", &paddle::framework::details::SetNanInfDebugPath); + // Add check op list + m.def("set_checked_op_list", + [](const std::string &op_list) { egr::SetCheckOpList(op_list); }); + + // Add skipped op list + m.def("set_skipped_op_list", + [](const std::string &op_list) { egr::SetSkipOpList(op_list); }); + m.def("check_numerics", [](const std::string &op_name, const paddle::Tensor &tensor) { VLOG(4) << "Check tensor whether has nan or inf."; diff --git a/python/paddle/amp/debugging.py b/python/paddle/amp/debugging.py index d103622198457..69ee21fe4c322 100644 --- a/python/paddle/amp/debugging.py +++ b/python/paddle/amp/debugging.py @@ -13,7 +13,6 @@ # limitations under the License. import contextlib -import os import random from enum import Enum @@ -24,82 +23,119 @@ from paddle.fluid.framework import dygraph_only __all__ = [ + "DebugMode", + "TensorCheckerConfig", "enable_operator_stats_collection", "disable_operator_stats_collection", "collect_operator_stats", + "enable_tensor_checker", + "disable_tensor_checker", ] class DebugMode(Enum): + """ + The DebugMode is a feature that helps to present the state of the TensorCheckerConfig. Each DebugMode has a specific meaning, which is explained below: + + - DebugMode.CHECK_NAN_INF_AND_ABORT: This mode prints or saves information about Tensors that contain NaN/Inf and interrupts the program. + + - DebugMode.CHECK_NAN_INF: This mode prints or saves critical information about Tensors that contain NaN/Inf but allows the program to continue running.
+ + - DebugMode.CHECK_ALL_FOR_OVERFLOW: This mode checks the output of the FP32 operator and prints or saves information about key Tensors that exceed the FP16 representation range, such as overflow or underflow. + + - DebugMode.CHECK_ALL: This mode prints or saves output Tensor key information for all operators. + + """ + CHECK_NAN_INF_AND_ABORT = 0 CHECK_NAN_INF = 1 CHECK_ALL_FOR_OVERFLOW = 2 CHECK_ALL = 3 - CHECK_ALL_AND_ABORT = 4 - DUMP_ALL = 5 + # CHECK_ALL_AND_ABORT = 4 + # DUMP_ALL = 5 + + +def set_checked_op_list(checked_op_list): + # check checked_op_list + if checked_op_list is not None: + if isinstance(checked_op_list, (list, tuple)): + check_op_list = ",".join(value for value in checked_op_list) + paddle.fluid.core.set_checked_op_list(check_op_list) + else: + raise ValueError("checked_op_list must be list or tuple") + + +def set_skipped_op_list(skipped_op_list): + # check skipped_op_list + if skipped_op_list is not None: + if isinstance(skipped_op_list, (list, tuple)): + skip_op_list = ",".join(value for value in skipped_op_list) + paddle.fluid.core.set_skipped_op_list(skip_op_list) + else: + raise ValueError("skipped_op_list must be list or tuple") class TensorCheckerConfig: """ - Collect the config for checking nan and inf in module or op tensor. + The purpose of this class is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator. It takes the following arguments: Args: - * enable: Whether to enable Tensor's value detection function. The default value is False, which means that these tools will never be used. + enable(bool): Indicating whether to enable the detection of NaN and Inf values in tensors. The default value is False, which means that these tools will not be used. - * debug_mode: Debug mode,There are 6 kinds of debug mode. - CHECK_NAN_INF_AND_ABORT(default): Print or save Tensor key information with NaN/Inf and interrupt the program - CHECK_NAN_INF: Print or save Tensor critical information with NaN/Inf, but continue to run - CHECK_ALL_AND_ABORT: Print or save the output Tensor key information of all operators, and interrupt the program if NaN/Inf occurs - CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator, print or save key Tensor information that exceeds the FP16 representation range (overflow, underflow) - CHECK_ALL: Print or save output Tensor key information for all operators - DUMP_ALL: Saves all Tensor data. This mode does not print on the terminal + debug_mode(DebugMode, optional): A parameter that determines the type of debugging to be used. Default is DebugMode.CHECK_NAN_INF_AND_ABORT. - * dump_dir: The collection data storage path. If it is None, it will be directly printed to the terminal + output_dir(string, optional): The path to store collected data. If this parameter is set to None, the data will be printed to the terminal. Default is None. - * checked_op_list: A list of operators you want to check + checked_op_list(list|tuple, optional): Specifies a list of operators that need to be checked during program execution, for example, checked_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should be checked for nan/inf during program execution. Default is None. 
- * skipped_op_list: A list of operators to skip checking + skipped_op_list(list|tuple, optional): Specifies a list of operators that do not need to be checked during program execution, for example, skipped_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should not be checked for nan/inf during program execution. Default is None. - * debug_step: The iteration scope of debugging + debug_step(list|tuple, optional): A list or tuple used primarily for nan/inf checking during model training. For example, debug_step=[1,5] indicates that nan/inf checking should only be performed on model training iterations 1 to 5. Default is None. - * stack_height_limit: The maximum depth of the call stack, and supports printing the call stack at the error location. The specific scheme needs to be investigated - - * enable_traceback_filtering: Whether to filter the traceback. The main purpose is to filter out the internal code call stack of the framework and only display the user code call stack + stack_height_limit(int, optional): An integer value specifying the maximum depth of the call stack. This feature supports printing the call stack at the error location. Currently, only enabling or disabling call stack printing is supported. If you want to print the corresponding C++ call stack when NaN is detected in GPU Kernel, set stack_height_limit to 1, otherwise set it to 0. Default is 1. Examples: - .. code-block:: python - import paddle - checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT) - paddle.amp.debugging.enable_tensor_checker(checker_config) + .. code-block:: python + + import paddle + + checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF) + paddle.amp.debugging.enable_tensor_checker(checker_config) - x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) - y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') - res = paddle.pow(x, y) + x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) + y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') + res = paddle.pow(x, y) + paddle.autograd.backward(res, retain_graph=True) + paddle.amp.debugging.disable_tensor_checker() - paddle.autograd.backward(res, retain_graph=True) - paddle.amp.debugging.disable_tensor_checker() + #[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan + + # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1 + #Traceback (most recent call last): + # res = paddle.pow(x, y) + # File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow + # return _C_ops.elementwise_pow(x, y) """ # For module debugging - Current_step_id = 0 + current_step_id = 0 def __init__( self, enable, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT, - dump_dir=None, + output_dir=None, checked_op_list=None, skipped_op_list=None, debug_step=None, - stack_height_limit=3, - enable_traceback_filtering=False, + stack_height_limit=1, ): self.enable = enable self.debug_mode = debug_mode - self.dump_dir = dump_dir + self.output_dir = output_dir self.checked_op_list = checked_op_list self.skipped_op_list = skipped_op_list @@ -107,8 +143,6 @@ def __init__( self.debug_step = debug_step self.stack_height_limit = 
stack_height_limit - self.enable_traceback_filtering = enable_traceback_filtering - self.start_step = None self.end_step = None @@ -146,60 +180,43 @@ def __init__( DebugMode.__members__, ) - # check checked_op_list - if self.checked_op_list is not None: - if isinstance(self.checked_op_list, (list, tuple)): - check_op_list = ",".join( - value for value in self.checked_op_list - ) - os.environ["Paddle_check_nan_inf_op_list"] = str(check_op_list) - else: - raise ValueError("checked_op_list must be list or tuple") + set_checked_op_list(self.checked_op_list) - # check skipped_op_list - if self.skipped_op_list is not None: - if isinstance(self.skipped_op_list, (list, tuple)): - skipped_op_list = ",".join( - value for value in self.skipped_op_list - ) - os.environ["Paddle_skip_nan_inf_op_list"] = str(skipped_op_list) - else: - raise ValueError("skipped_op_list must be list or tuple") + set_skipped_op_list(self.skipped_op_list) if self.enable: self._set_seed(self.enable) - def keep_random(self, seed, flag): + def _set_seed(self, flag): + if self.initial_seed != self.seed: + self.seed = self.initial_seed + + if self.seed > np.iinfo(np.uint32).max or self.seed < 0: + print("[Warning] Seed must be between 0 and 2**32 - 1") + self.seed = 123 + # get random seed - self.seed = seed paddle.seed(self.seed) np.random.seed(self.seed) random.seed(self.seed) + # info + print("AMP Debugging TensorCheckerConfig: seed ", self.seed) + # set cudnn and cpu if core.is_compiled_with_cuda(): paddle.set_flags({"FLAGS_cudnn_deterministic": flag}) - paddle.set_flags({"FLAGS_cpu_deterministic": flag}) + print( + "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ", + flag, + ) - # info - print("AMP Debugging TensorCheckerConfig: seed ", self.seed) - print( - "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ", - flag, - ) + paddle.set_flags({"FLAGS_cpu_deterministic": flag}) print( "AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ", flag, ) - def _set_seed(self, enable): - if self.initial_seed != self.seed: - self.seed = self.initial_seed - if self.seed > 4294967295 or self.seed < 0: - print("[Warnning: Seed must be between 0 and 2**32 - 1") - self.seed = 123 - self.keep_random(self.seed, True) - def _set_env(self, check_flag): paddle.set_flags({"FLAGS_check_nan_inf": check_flag}) if check_flag: @@ -209,35 +226,35 @@ def _set_env(self, check_flag): ) # set output_dir - if self.dump_dir is not None: - paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir) + if self.output_dir is not None: + paddle.fluid.core.set_nan_inf_debug_path(self.output_dir) # set stack_height_limit if isinstance(self.stack_height_limit, (int)): - paddle.set_flags( - {"FLAGS_call_stack_level": self.stack_height_limit} + paddle.fluid.core.set_nan_inf_stack_limit( + self.stack_height_limit ) else: raise ValueError("stack_height_limit must be int") - def check(self): + def update_and_check_step_id(self): if self.enable: if self.start_step is not None and self.end_step is not None: if ( - self.start_step > TensorCheckerConfig.Current_step_id - or TensorCheckerConfig.Current_step_id >= self.end_step + self.start_step > TensorCheckerConfig.current_step_id + or TensorCheckerConfig.current_step_id >= self.end_step ): return False else: - TensorCheckerConfig.Current_step_id += 1 + TensorCheckerConfig.current_step_id += 1 return True return False - def run(self): + def start_check_nan_inf(self): if self.enable: self._set_env(self.enable) - def end(self): + def stop_check_nan_inf(self): self._set_env(False) @@ 
-302,26 +319,26 @@ def enable_operator_stats_collection(): Examples: - .. code-block:: python + .. code-block:: python - import paddle + import paddle - conv = paddle.nn.Conv2D(3, 2, 3) - x = paddle.rand([10, 3, 32, 32]) + conv = paddle.nn.Conv2D(3, 2, 3) + x = paddle.rand([10, 3, 32, 32]) - paddle.amp.debugging.enable_operator_stats_collection() - # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype) - with paddle.amp.auto_cast(enable=True, level='O2'): - out = conv(x) - # Print to the standard output. - paddle.amp.debugging.disable_operator_stats_collection() - # <------------------------------------------------------- op list --------------------------------------------------------> - # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> - # conv2d | 1 | 0 | 0 | 0 - # elementwise_add | 1 | 0 | 0 | 0 - # reshape2 | 1 | 0 | 0 | 0 - # transfer_dtype | 0 | 0 | 3 | 0 - # <----------------------------------------------------- op count: 4 ------------------------------------------------------> + paddle.amp.debugging.enable_operator_stats_collection() + # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype) + with paddle.amp.auto_cast(enable=True, level='O2'): + out = conv(x) + # Print to the standard output. + paddle.amp.debugging.disable_operator_stats_collection() + # <------------------------------------------------------- op list --------------------------------------------------------> + # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> + # conv2d | 1 | 0 | 0 | 0 + # elementwise_add | 1 | 0 | 0 | 0 + # reshape2 | 1 | 0 | 0 | 0 + # transfer_dtype | 0 | 0 | 3 | 0 + # <----------------------------------------------------- op count: 4 ------------------------------------------------------> """ # Clear the previous stats. @@ -340,26 +357,26 @@ def disable_operator_stats_collection(): Examples: - .. code-block:: python + .. code-block:: python - import paddle + import paddle - conv = paddle.nn.Conv2D(3, 2, 3) - x = paddle.rand([10, 3, 32, 32]) + conv = paddle.nn.Conv2D(3, 2, 3) + x = paddle.rand([10, 3, 32, 32]) - paddle.amp.debugging.enable_operator_stats_collection() - # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype) - with paddle.amp.auto_cast(enable=True, level='O2'): - out = conv(x) - # Print to the standard output. - paddle.amp.debugging.disable_operator_stats_collection() - # <------------------------------------------------------- op list --------------------------------------------------------> - # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> - # conv2d | 1 | 0 | 0 | 0 - # elementwise_add | 1 | 0 | 0 | 0 - # reshape2 | 1 | 0 | 0 | 0 - # transfer_dtype | 0 | 0 | 3 | 0 - # <----------------------------------------------------- op count: 4 ------------------------------------------------------> + paddle.amp.debugging.enable_operator_stats_collection() + # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype) + with paddle.amp.auto_cast(enable=True, level='O2'): + out = conv(x) + # Print to the standard output. 
+ paddle.amp.debugging.disable_operator_stats_collection() + # <------------------------------------------------------- op list --------------------------------------------------------> + # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> + # conv2d | 1 | 0 | 0 | 0 + # elementwise_add | 1 | 0 | 0 | 0 + # reshape2 | 1 | 0 | 0 | 0 + # transfer_dtype | 0 | 0 | 3 | 0 + # <----------------------------------------------------- op count: 4 ------------------------------------------------------> """ if not _get_operator_stats_flag(): @@ -381,25 +398,25 @@ def collect_operator_stats(): Examples: - .. code-block:: python + .. code-block:: python - import paddle + import paddle - conv = paddle.nn.Conv2D(3, 2, 3) - x = paddle.rand([10, 3, 32, 32]) + conv = paddle.nn.Conv2D(3, 2, 3) + x = paddle.rand([10, 3, 32, 32]) - with paddle.amp.debugging.collect_operator_stats(): - # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype) - with paddle.amp.auto_cast(enable=True, level='O2'): - out = conv(x) - # Print to the standard output. - # <------------------------------------------------------- op list --------------------------------------------------------> - # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> - # conv2d | 1 | 0 | 0 | 0 - # elementwise_add | 1 | 0 | 0 | 0 - # reshape2 | 1 | 0 | 0 | 0 - # transfer_dtype | 0 | 0 | 3 | 0 - # <----------------------------------------------------- op count: 4 ------------------------------------------------------> + with paddle.amp.debugging.collect_operator_stats(): + # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype) + with paddle.amp.auto_cast(enable=True, level='O2'): + out = conv(x) + # Print to the standard output. + # <------------------------------------------------------- op list --------------------------------------------------------> + # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls --> + # conv2d | 1 | 0 | 0 | 0 + # elementwise_add | 1 | 0 | 0 | 0 + # reshape2 | 1 | 0 | 0 | 0 + # transfer_dtype | 0 | 0 | 3 | 0 + # <----------------------------------------------------- op count: 4 ------------------------------------------------------> """ enable_operator_stats_collection() @@ -409,57 +426,74 @@ def collect_operator_stats(): def enable_tensor_checker(checker_config): """ - enable_tensor_checker(checker_config) is enables model level accuracy checking, which is used together with disables_tensor_checker() to achieve model level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range. + The enable_tensor_checker(checker_config) function enables model-level accuracy checking and is used in combination with disables_tensor_checker() to achieve model-level precision checking by checking the output Tensors of all operators within the specified range. - Attention: - - * If disable is called before loss. backward()_tensor_checker(), the gradient operator is not checked; + Args: + checker_config(TensorCheckerConfig): Checker_config is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator. 
- * If disable is called before optimizer.step() tensor_checker(), the optimizer and other weight update related operators will not be checked + Note: + If disable_tensor_checker() is called before backward(), the gradient operator will not be checked. + If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked. Examples: - .. code-block:: python - import paddle - checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT) - paddle.amp.debugging.enable_tensor_checker(checker_config) + .. code-block:: python + + import paddle + + checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF) + paddle.amp.debugging.enable_tensor_checker(checker_config) - x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) - y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') - res = paddle.pow(x, y) - paddle.autograd.backward(res, retain_graph=True) + x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) + y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') + res = paddle.pow(x, y) + paddle.autograd.backward(res, retain_graph=True) + paddle.amp.debugging.disable_tensor_checker() + #[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan + + # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1 + # Traceback (most recent call last): + # File "tp.py", line 8, in + # res = paddle.pow(x, y) + # File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow + # return _C_ops.elementwise_pow(x, y) - paddle.amp.debugging.disable_tensor_checker() """ - if checker_config.check(): - checker_config.run() + if checker_config.update_and_check_step_id(): + checker_config.start_check_nan_inf() else: - checker_config.end() + checker_config.stop_check_nan_inf() def disable_tensor_checker(): """ - disable_tensor_checker() to disables the accuracy checking, which is used together with enables_tensor_checker(config) to achieve model level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range. + disable_tensor_checker() is used to disable accuracy checking, and is used together with enable_tensor_checker(config) to achieve model-level precision checking by checking the output Tensors of all operators within the specified range. - Attention: + Note: + If disable_tensor_checker() is called before backward(), the gradient operator will not be checked; + If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked. - * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked; + Examples: - * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked + .. code-block:: python - Examples: - .. 
code-block:: python - import paddle + import paddle - checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT) - paddle.amp.debugging.enable_tensor_checker(checker_config) + checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF) + paddle.amp.debugging.enable_tensor_checker(checker_config) - x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) - y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') - res = paddle.pow(x, y) - paddle.autograd.backward(res, retain_graph=True) + x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) + y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') + res = paddle.pow(x, y) + paddle.autograd.backward(res, retain_graph=True) + paddle.amp.debugging.disable_tensor_checker() + #[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan - paddle.amp.debugging.disable_tensor_checker() + # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1 + # Traceback (most recent call last): + # res = paddle.pow(x, y) + # File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow + # return _C_ops.elementwise_pow(x, y) """ paddle.set_flags({"FLAGS_check_nan_inf": 0}) diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py index 425dc9a7e997e..06695c56f243d 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py @@ -78,7 +78,14 @@ def get_num_nan_inf( def test_num_nan_inf(self): path = "nan_inf_log_dir" - paddle.fluid.core.set_nan_inf_debug_path(path) + + checker_config = paddle.amp.debugging.TensorCheckerConfig( + enable=True, + debug_mode=paddle.amp.debugging.DebugMode.CHECK_ALL, + output_dir=path, + ) + + paddle.amp.debugging.enable_tensor_checker(checker_config) def _check_num_nan_inf(use_cuda): shape = [32, 32] @@ -86,145 +93,25 @@ def _check_num_nan_inf(use_cuda): num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np) add_assert = (num_nan_np + num_inf_np) > 0 num_nan, num_inf = self.get_num_nan_inf( - x_np, use_cuda, add_assert, path + x_np, + use_cuda, + add_assert, + path, ) if not use_cuda: assert num_nan == num_nan_np and num_inf == num_inf_np - paddle.set_flags( - {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3} - ) - _check_num_nan_inf(use_cuda=False) if paddle.fluid.core.is_compiled_with_cuda(): _check_num_nan_inf(use_cuda=True) + else: + _check_num_nan_inf(use_cuda=False) + x = paddle.to_tensor([2, 3, 4], 'float32') y = paddle.to_tensor([1, 5, 2], 'float32') z = paddle.add(x, y) path = "" paddle.fluid.core.set_nan_inf_debug_path(path) - - def test_nan_inf_op(self): - import paddle - - num_nan = 0 - num_inf = 0 - # check op list - x = paddle.to_tensor( - [1, 0, 1], - place=paddle.CPUPlace(), - dtype='float32', - stop_gradient=False, - ) - y = paddle.to_tensor( - [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32' - ) - try: - res = paddle.pow(x, y) - except Exception as e: - # Cannot catch the log in CUDA kernel. 
- err_str_list = ( - str(e) - .replace("(", " ") - .replace(")", " ") - .replace(",", " ") - .split(" ") - ) - for err_str in err_str_list: - if "num_nan" in err_str: - num_nan = int(err_str.split("=")[1]) - elif "num_inf" in err_str: - num_inf = int(err_str.split("=")[1]) - print( - "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format( - num_nan, num_inf - ) - ) - return num_inf - - def test_check_op_list(self): - import paddle - - num_nan = 0 - num_inf = 0 - - checker_config = paddle.amp.debugging.TensorCheckerConfig( - enable=True, - debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT, - skipped_op_list=["elementwise_div"], - ) - - x = paddle.to_tensor( - [0, 0, 0], - place=paddle.CPUPlace(), - dtype='float32', - stop_gradient=False, - ) - y = paddle.to_tensor( - [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32' - ) - paddle.amp.debugging.enable_tensor_checker(checker_config) - try: - res = paddle.divide(y, x) - except Exception as e: - # Cannot catch the log in CUDA kernel. - err_str_list = ( - str(e) - .replace("(", " ") - .replace(")", " ") - .replace(",", " ") - .split(" ") - ) - for err_str in err_str_list: - if "num_nan" in err_str: - num_nan = int(err_str.split("=")[1]) - elif "num_inf" in err_str: - num_inf = int(err_str.split("=")[1]) - print( - "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format( - num_nan, num_inf - ) - ) - paddle.amp.debugging.enable_tensor_checker(checker_config) - - def test_tensor_checker(self): - import paddle - - def _assert_flag(value): - flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level'] - res = paddle.get_flags(flags) - assert res["FLAGS_check_nan_inf"] == value - - paddle.set_flags({"FLAGS_check_nan_inf": 0}) - paddle.seed(102) - checker_config = paddle.amp.debugging.TensorCheckerConfig( - enable=True, - debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT, - checked_op_list=["elementwise_pow"], - skipped_op_list=["elementwise_add"], - debug_step=[0, 3], - ) - # check seed - assert checker_config.initial_seed == 102 - assert checker_config.seed == 102 - _assert_flag(False) - for index in range(5): - paddle.amp.debugging.enable_tensor_checker(checker_config) - if index <= 2: - _assert_flag(True) - assert ( - index + 1 - == paddle.amp.debugging.TensorCheckerConfig.Current_step_id - ) - assert 1 == self.test_nan_inf_op() - else: - assert ( - 3 - == paddle.amp.debugging.TensorCheckerConfig.Current_step_id - ) - _assert_flag(False) - assert 0 == self.test_nan_inf_op() - paddle.amp.debugging.disable_tensor_checker() - _assert_flag(False) + paddle.amp.debugging.disable_tensor_checker() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_tensor_checker.py b/python/paddle/fluid/tests/unittests/test_tensor_checker.py new file mode 100644 index 0000000000000..a5b5e82034fd0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_checker.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle + + +class TestTensorChecker(unittest.TestCase): + def get_num_inf(self, e): + num_nan = 0 + num_inf = 0 + # Cannot catch the log in CUDA kernel. + err_str_list = ( + str(e) + .replace("(", " ") + .replace(")", " ") + .replace(",", " ") + .split(" ") + ) + for err_str in err_str_list: + if "num_nan" in err_str: + num_nan = int(err_str.split("=")[1]) + elif "num_inf" in err_str: + num_inf = int(err_str.split("=")[1]) + print( + "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format( + num_nan, num_inf + ) + ) + return num_nan + + def generate_num_inf(self, place): + num_inf = 0 + num_nan = 0 + paddle.set_device(place) + # check op list + x = paddle.to_tensor( + [1, 0, 0], + dtype='float32', + stop_gradient=False, + ) + y = paddle.to_tensor([0, 0, 1], dtype='float32') + try: + res = paddle.pow(x, y) + # test backward + paddle.autograd.backward([res]) + res = paddle.divide(y, x) + except Exception as e: + num_inf = self.get_num_inf(e) + return num_inf + + def test_tensor_checker(self): + def _assert_flag(value): + flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level'] + res = paddle.get_flags(flags) + assert res["FLAGS_check_nan_inf"] == value + + paddle.set_flags({"FLAGS_check_nan_inf": 0}) + paddle.seed(102) + checker_config = paddle.amp.debugging.TensorCheckerConfig( + enable=True, + debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT, + checked_op_list=["elementwise_pow_grad"], + skipped_op_list=["elementwise_div"], + debug_step=[0, 3], + ) + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + # check seed + self.assertEqual(checker_config.initial_seed, 102) + self.assertEqual(checker_config.seed, 102) + _assert_flag(False) + + for place in places: + paddle.amp.debugging.TensorCheckerConfig.current_step_id = 0 + for index in range(5): + paddle.amp.debugging.enable_tensor_checker(checker_config) + if index <= 2: + _assert_flag(True) + self.assertEqual( + index + 1, + paddle.amp.debugging.TensorCheckerConfig.current_step_id, + ) + self.assertEqual(1, self.generate_num_inf(place)) + else: + self.assertEqual( + 3, + paddle.amp.debugging.TensorCheckerConfig.current_step_id, + ) + _assert_flag(False) + self.assertEqual(0, self.generate_num_inf(place)) + + paddle.amp.debugging.disable_tensor_checker() + _assert_flag(False) + + +if __name__ == '__main__': + unittest.main() From f0f5866524a765376770535523b88ce37e47ddf5 Mon Sep 17 00:00:00 2001 From: kangguangli Date: Mon, 24 Apr 2023 17:27:16 +0800 Subject: [PATCH 029/405] [BugFix] wrong match between depend and c_allreduce_sum (#53089) * fix bug: wrong match between depend and c_allreduce_sum * fix codestyle * fix bug * add c_sync_calc_stream back * fix * revert * use flag to control * fix for code coverage --- .../meta_optimizers/raw_program_optimizer.py | 37 +++++++++++++++++-- .../test_dist_fleet_raw_program_optimizer.py | 17 +++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index bd1faf1401402..8919ded2e245c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and +import os + from paddle import static from paddle.fluid import core from paddle.framework import _global_flags @@ -62,6 +64,9 @@ def _set_basic_info( self.calc_comm_same_stream = ( user_defined_strategy._calc_comm_same_stream ) + self.sync_before_allreduce = os.environ.get( + 'FLAGS_sync_before_allreduce', None + ) def _can_apply(self): if not self.role_maker._is_collective: @@ -433,17 +438,28 @@ def get_after_idx_of_fuse_group(grad_param_segments): OP_ROLE_KEY: OpRole.Backward, }, ) + if not self.calc_comm_same_stream and self.sync_before_allreduce: + block._insert_op_without_sync( + after_idx + 1, + type='c_sync_calc_stream', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={OP_ROLE_KEY: OpRole.Backward}, + ) idx = 0 - if not self.calc_comm_same_stream: + if not self.calc_comm_same_stream and not self.sync_before_allreduce: for i in range(len(grad_param_segments)): - while block.ops[idx].type != 'c_allreduce_sum': + while ( + block.ops[idx].type != 'c_allreduce_sum' + or fused_vars[i].name not in block.ops[idx].input_arg_names + ): idx += 1 grad_segment, param_segment = grad_param_segments[i] for grad in grad_segment: block._insert_op_without_sync( idx + 1, type='depend', - inputs={'X': grad, 'Dep': fused_var}, + inputs={'X': grad, 'Dep': fused_vars[i]}, outputs={'Out': grad}, ) idx += 1 @@ -486,6 +502,21 @@ def get_after_idx_of_fuse_group(grad_param_segments): }, ) + if self.calc_comm_same_stream or not self.sync_before_allreduce: + block._sync_with_cpp() + return + + # insert the sync comm op + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + block._insert_op_without_sync( + idx, + type='c_sync_comm_stream', + inputs={'X': fused_vars}, + outputs={'Out': fused_vars}, + attrs={'ring_id': ring_id, OP_ROLE_KEY: OpRole.Backward}, + ) + break block._sync_with_cpp() def __get_ouputs_name_to_idx(self, first_backward_idx, block): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py index ba826bcf4ff17..c19791a3c33a8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py @@ -45,5 +45,22 @@ def test_dist_train(self): ) +class TestFleetMetaOptimizerPrecisionWithSync(TestFleetMetaOptimizerPrecision): + def need_envs(self): + return {'FLAGS_sync_before_allreduce': '1'} + + def test_dist_train(self): + from paddle import fluid + + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "dist_fleet_raw_program_optimizer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name + 'with_sync', + need_envs=self.need_envs(), + ) + + if __name__ == '__main__': unittest.main() From 21508090058baceb46d96b70418f1308d95eb993 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Mon, 24 Apr 2023 18:09:40 +0800 Subject: [PATCH 030/405] fix 'Werror-maybe-uninitialized' compiler error in GCC 11.3 (#53246) --- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/kernels/funcs/sequence2batch.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index a63c2cc9cfea4..634c37933cbe9 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -97,7 +97,7 @@ int64_t Tensor::size() const { return impl_->numel(); } const phi::DDim &Tensor::dims() const { return impl_->dims(); } std::vector 
Tensor::shape() const { - auto dims = impl_->dims(); + const auto &dims = impl_->dims(); return phi::vectorize(dims); } diff --git a/paddle/phi/kernels/funcs/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc index 11a687cdeafac..a172f09cf36f6 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -26,8 +26,8 @@ class CopyMatrixRowsFunctor { phi::DenseTensor* dst, bool is_src_index) { size_t* index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); + const auto& src_dims = vectorize(src.dims()); + const auto& dst_dims = vectorize(dst->dims()); PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, phi::errors::InvalidArgument( From 81c89dd6c6b0cf48a529fa8391babd045958bd31 Mon Sep 17 00:00:00 2001 From: JYChen Date: Mon, 24 Apr 2023 18:52:27 +0800 Subject: [PATCH 031/405] fix right value is 0d and index is List/Tensor (#53225) --- python/paddle/fluid/variable_index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index fbfcf78991271..fe7833f048f04 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -185,7 +185,8 @@ def set_item(self, tensor_origin, value): for i in range(len(gather_tensor_shape)): if not ( - value_dims_bd[i] == gather_tensor_shape[i] + len(value_dims_bd) == 0 + or value_dims_bd[i] == gather_tensor_shape[i] or value_dims_bd[i] == 1 ): raise ValueError( From 9d0befe90d1c16676cad87c84d68af2580357655 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 24 Apr 2023 18:40:05 +0800 Subject: [PATCH 032/405] Reorganize the forward codes of flash-attention. --- .../manual/fluid_manual/dygraph_forward_api.h | 1 + .../forwards/fused_gate_attention_fwd_func.cc | 14 +- .../operators/fused/fused_gate_attention.h | 750 +++++++++--------- .../fused/fused_gate_attention_op.cc | 6 +- .../fused/fused_gate_attention_op.cu | 38 +- 5 files changed, 434 insertions(+), 375 deletions(-) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h index 7fda5aa69b70f..d8a4fee0caf6e 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h @@ -27,6 +27,7 @@ std::tuple fused_gate_attention_dygraph_function( const paddle::Tensor& Query, diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc index fd3d32401d922..b0585bc7acd54 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc @@ -26,6 +26,7 @@ std::tuple fused_gate_attention_dygraph_function( const paddle::Tensor& Query, @@ -181,6 +182,9 @@ fused_gate_attention_dygraph_function( {"SoftmaxOut", {std::make_shared( egr::Controller::Instance().GenerateUniqueName())}}, + {"SoftmaxLse", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, {"FMHAOut", {std::make_shared( egr::Controller::Instance().GenerateUniqueName())}}, @@ -256,6 +260,8 @@ fused_gate_attention_dygraph_function( egr::EagerUtils::GetOutput(outs["QKVTransposeOut"][0], &QKVTransposeOut); paddle::Tensor SoftmaxOut; egr::EagerUtils::GetOutput(outs["SoftmaxOut"][0], &SoftmaxOut); + paddle::Tensor SoftmaxLse; + 
egr::EagerUtils::GetOutput(outs["SoftmaxLse"][0], &SoftmaxLse); paddle::Tensor FMHAOut; egr::EagerUtils::GetOutput(outs["FMHAOut"][0], &FMHAOut); paddle::Tensor GateOut; @@ -296,7 +302,7 @@ fused_gate_attention_dygraph_function( p_autograd_Out); // Create GradOpNode auto grad_node = std::shared_ptr( - new fused_gate_attentionGradNodeCompat(8, 12)); + new fused_gate_attentionGradNodeCompat(9, 12)); bool merge_qkv = true; if (attrs.count("merge_qkv")) { @@ -308,6 +314,11 @@ fused_gate_attention_dygraph_function( has_gating = PADDLE_GET_CONST(bool, attrs.at("has_gating")); } + // bool use_flash_attn = false; + // if (attrs.count("use_flash_attn")) { + // use_flash_attn = PADDLE_GET_CONST(bool, attrs.at("use_flash_attn")); + // } + // Set Attributes grad_node->SetAttrMap(std::move(attrs)); grad_node->SetDefaultAttrMap(std::move(default_attrs)); @@ -379,6 +390,7 @@ fused_gate_attention_dygraph_function( ValueTransposeOut, QKVTransposeOut, SoftmaxOut, + SoftmaxLse, FMHAOut, GateOut, Out); diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 7cacf8f1b1f31..d0310383abe8b 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -99,7 +99,7 @@ struct GateAttentionConfig { const phi::DenseTensor* qkv_weight, bool merge_qkv, bool has_gating, - bool use_flash_attn = false) + bool use_flash_attn) : dev_ctx(dev_ctx), merge_qkv(merge_qkv), has_gating(has_gating), @@ -158,8 +158,8 @@ struct GateAttentionConfig { gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, head_dim}; } - bool UseFlashAttn(const bool merge_qkv, const bool run_flash_attn) { - if (!run_flash_attn) { + bool CanUseFlashAttn() const { + if (!std::is_same::value) { return false; } @@ -169,7 +169,7 @@ struct GateAttentionConfig { case 32: case 64: case 128: - return true; + return use_flash_attn; default: return false; } @@ -436,369 +436,145 @@ class FMHAGateRef { T* q_ptr = nullptr; T* k_ptr = nullptr; T* v_ptr = nullptr; - bool is_bf16 = - qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; - - if (std::is_same::value) { - std::cout << "T is phi::dtype::float16. \n"; - } else if (std::is_same::value) { - std::cout << "T is phi::dtype::bfloat16. \n"; - } else if (std::is_same::value) { - std::cout << "T is float. \n"; - } - if (config->UseFlashAttn(merge_qkv_, config->use_flash_attn && is_bf16)) { + if (merge_qkv_) { + // qkv_transpose_out = transpose(qkv_out) PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); - // 1. Dealing with qkv_out for flash_attn. phi::DenseTensor* qkv_out = config->GetQKVOut(); - ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); + ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); - int seq_batch_size = static_cast(config->batch_size) * - static_cast(config->seq_len_m); - qkv_transpose_out->Resize( - {3, - seq_batch_size * static_cast(config->seq_len_r), - static_cast(config->num_heads), - static_cast(config->head_dim)}); - DBG_WAIT; - // q_size == k_size int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); k_ptr = q_ptr + q_size; v_ptr = k_ptr + q_size; - - // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. 
- phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t end_size = (seq_batch_size + 1); - int64_t seq_size = 0; - int64_t start = 0, end = end_size, - step = static_cast(config->seq_len_r); - phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({end_size}); - cu_seq_k.Resize({end_size}); - AllocWithDebugInfo(dev_ctx_, "cu_seq_q", &cu_seq_q); - AllocWithDebugInfo(dev_ctx_, "cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (seq_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, cu_seq_q.data(), cu_seq_k.data()); - VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start - << ", step = " << step << ", end = " << end; - DBG_WAIT; - - // 3. Dealing with mask and bias for flash_attn. - phi::DenseTensor temp_mask, temp_bias; - auto dims_merge_func = [&](const phi::DenseTensor* src_tensor, - phi::DenseTensor* dst_tensor, - const std::string& prefix) { - if (src_tensor) { - int64_t first_dim = 1; - dst_tensor->ShareDataWith(*src_tensor); - auto dims_ = src_tensor->dims(); - for (int i = 0; i < dims_.size() - 3; ++i) { - first_dim *= dims_[i]; - } - auto dims_rank = dims_.size(); - dst_tensor->Resize({first_dim, - dims_[dims_rank - 3], - dims_[dims_rank - 2], - dims_[dims_rank - 1]}); - GetFlashAttnDimsString(prefix, temp_mask.dims()); - } - }; - auto& qkv_dims = qkv_transpose_out->dims(); - dims_merge_func(src_mask, &temp_mask, "mask_dim"); - dims_merge_func(nonbatched_bias, &temp_bias, "bias_dim"); - GetFlashAttnDimsString("qkv_transpose_out", qkv_dims); - DBG_WAIT; - // 4. flash_attn parameter setting. - - int batch_size_ = seq_batch_size; - int total_q_ = qkv_dims[1]; // q.dims()[0] - int total_k_ = qkv_dims[1]; // q.dims()[0] - int num_heads_ = qkv_dims[2]; // q.dims()[1] - int head_size_ = qkv_dims[3]; // q.dims()[2] - int max_seqlen_q_ = batch_size_; - int max_seqlen_k_ = batch_size_; - int num_splits = 0; // 0 for an internal heuristic, which is optimal - VLOG(6) << "[Flash_attn Fwd] batch_size : " << batch_size_; - VLOG(6) << "[Flash_attn Fwd] total_q : " << total_q_; - VLOG(6) << "[Flash_attn Fwd] total_k : " << total_k_; - VLOG(6) << "[Flash_attn Fwd] num_heads : " << num_heads_; - VLOG(6) << "[Flash_attn Fwd] head_size : " << head_size_; - VLOG(6) << "[Flash_attn Fwd] max_seqlen_q : " << max_seqlen_q_; - VLOG(6) << "[Flash_attn Fwd] max_seqlen_k : " << max_seqlen_k_; - - // 5. construct softmax_lse - phi::DenseTensor softmax_lse; - int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; - softmax_lse.Resize({batch_size_, num_heads_, softmax_lse_last_dim}); - AllocWithDebugInfo( - dev_ctx_, "flash_attn: softmax_lse", &softmax_lse); - - DBG_WAIT; - // 6. construct random seed - auto gen = dev_ctx_.GetGenerator(); - uint64_t inc = batch_size_ * num_heads_ * 32; - auto seed_offset_pair = gen->IncrementOffset(inc); - uint64_t seed = seed_offset_pair.first; - uint64_t offset = seed_offset_pair.second; - - GetFlashAttnDimsString("softmax_out", softmax_out->dims()); - GetFlashAttnDimsString("softmax_lse", softmax_lse.dims()); - GetFlashAttnDimsString("cu_seq_q", cu_seq_q.dims()); - GetFlashAttnDimsString("cu_seq_k", cu_seq_k.dims()); - DBG_WAIT; - - // 7. flas_attn part one, get temp worksapce size. 
- float p_dropout = 0.f; - float softmax_scale = static_cast(1); - cudaStream_t stream = dev_ctx_.stream(); - uint64_t workspace_size; - bool succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( - static_cast(q_ptr), - static_cast(k_ptr), - static_cast(v_ptr), - nullptr, // for calculation workspace size - cu_seq_q.data(), - cu_seq_k.data(), - total_q_, - total_k_, - batch_size_, - num_heads_, - head_size_, - max_seqlen_q_, - max_seqlen_k_, - p_dropout, - softmax_scale, - /*zero_tensors=*/false, - /*is_causal=*/false, - is_bf16, - num_splits, - softmax_lse.data(), - softmax_out->data(), - nullptr, - &workspace_size, - stream, - seed, - offset, - src_mask ? temp_mask.data() : nullptr, - nonbatched_bias ? temp_bias.data() : nullptr, - temp_mask.dims().Get(), - temp_bias.dims().Get()); - if (!succ) { - PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); - } - DBG_WAIT; - - phi::DenseTensor workspace; - printf("workspace_size = %d\n", workspace_size); - if (workspace_size > 0) { - workspace = phi::Empty( - dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); - } - DBG_WAIT; - -#define DBG_INIT(prefix, x) \ - do { \ - printf("[%s, %d] ", __func__, __LINE__); \ - if (x->initialized()) { \ - std::cout << prefix << " is initialized." << std::endl; \ - } else { \ - std::cout << prefix << " is not initialized." << std::endl; \ - } \ - } while (0); - DBG_INIT("qkv_transpose_out", qkv_transpose_out); - DBG_INIT("softmax_out", softmax_out); - DBG_INIT("src_mask", src_mask); - DBG_INIT("fmha_out", fmha_out); - DBG_INIT("gate_out", gate_out); - - // 8. flas_attn part two, run impl. - succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( - static_cast(q_ptr), - static_cast(k_ptr), - static_cast(v_ptr), - static_cast( - fmha_out->data()), // for calculation workspace size - cu_seq_q.data(), - cu_seq_k.data(), - total_q_, - total_k_, - batch_size_, - num_heads_, - head_size_, - max_seqlen_q_, - max_seqlen_k_, - p_dropout, - softmax_scale, - /*zero_tensors=*/false, - /*is_causal=*/false, - is_bf16, - num_splits, - softmax_lse.data(), - softmax_out->data(), - (workspace_size > 0) ? static_cast(workspace.data()) : nullptr, - &workspace_size, - stream, - seed, - offset, - src_mask ? temp_mask.data() : nullptr, - nonbatched_bias ? 
temp_bias.data() : nullptr, - temp_mask.dims().Get(), - temp_bias.dims().Get()); - DBG_WAIT; - if (!succ) { - PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); - } - DBG_WAIT; } else { - if (merge_qkv_) { - // qkv_transpose_out = transpose(qkv_out) - PADDLE_ENFORCE_NOT_NULL( - qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be " - "nullptr when merge_qkv is true.")); - - phi::DenseTensor* qkv_out = config->GetQKVOut(); - ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); - config->ClearQKVOut(); - - // q_size == k_size - int64_t q_size = config->GetQuerySize(); - q_ptr = qkv_transpose_out->data(); - k_ptr = q_ptr + q_size; - v_ptr = k_ptr + q_size; - } else { - PADDLE_ENFORCE_NOT_NULL( - q_transpose_out, - platform::errors::NotFound("The input q_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - k_transpose_out, - platform::errors::NotFound("The input k_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - v_transpose_out, - platform::errors::NotFound("The input v_transpose_out can not be " - "nullptr when merge_qkv is false.")); - - phi::DenseTensor* query_out = config->GetQueryOut(); - phi::DenseTensor* key_out = config->GetKeyOut(); - phi::DenseTensor* value_out = config->GetValueOut(); - ComputeQKVTransposeForward(*query_out, - *key_out, - *value_out, - q_transpose_out, - k_transpose_out, - v_transpose_out); - - // q_size != k_size - q_ptr = q_transpose_out->data(); - k_ptr = k_transpose_out->data(); - v_ptr = v_transpose_out->data(); - } - // qk_out = BatchedGEMM(Q, K^T) - // [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] * - // [batch_size, seq_len_m, num_heads, m_size, head_dim] - // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] - phi::DenseTensor* qk_out = config->GetQKOut(softmax_out); - T* qk_out_ptr = qk_out->data(); - - int64_t gemm_batch_size = - config->batch_size * config->seq_len_m * config->num_heads; - int64_t gemm_m = config->seq_len_r; - int64_t gemm_n = config->m_size; - int64_t gemm_k = config->head_dim; - // attn = torch.matmul(q, k.transpose(-1, -2)) - T alpha = static_cast(1.0 / sqrt(config->head_dim)); - // ComputeBatchedGEMM(merge_qkv_ ? - // phi::slice_ddim(qkv_transpose_out->dims(), - // 1, - // qkv_transpose_out->dims().size() - // - 1) : q_transpose_out->dims(), - // merge_qkv_ ? 
- // phi::slice_ddim(qkv_transpose_out->dims(), - // 1, - // qkv_transpose_out->dims().size() - // - 1) : k_transpose_out->dims(), - // q_ptr, - // k_ptr, - // qk_out_ptr, - // false, - // true, - // gemm_m, - // gemm_n, - // gemm_k, - // gemm_batch_size, - // alpha); - - ComputeBatchedGEMM(q_ptr, - k_ptr, - qk_out_ptr, - false, - true, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size, - alpha); - // attn = softmax_dropout(attn, 0, self.training, mask=mask, bias=bias) - // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) - ComputeBiasMaskSoftmaxForward( - nonbatched_bias, src_mask, qk_out, softmax_out); - config->ClearQKOut(); - - // qktv_out = BatchedGEMM(softmax_out, V) - // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * - // [batch_size, seq_len_m, num_heads, m_size, head_dim] - // -> [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] - phi::DenseTensor* qktv_out = config->GetQKTVOut(gate_out); - T* qktv_out_ptr = qktv_out->data(); - - gemm_m = config->seq_len_r; - gemm_n = config->head_dim; - gemm_k = config->m_size; - - // o = torch.matmul(attn, v) - T* softmax_out_ptr = softmax_out->data(); - // ComputeBatchedGEMM(softmax_out->dims(), - // merge_qkv_ ? - // phi::slice_ddim(qkv_transpose_out->dims(), - // 1, - // qkv_transpose_out->dims().size() - // - 1) : v_transpose_out->dims(), - // softmax_out_ptr, - // v_ptr, - // qktv_out_ptr, - // false, - // false, - // gemm_m, - // gemm_n, - // gemm_k, - // gemm_batch_size); - - ComputeBatchedGEMM(softmax_out_ptr, - v_ptr, - qktv_out_ptr, - false, - false, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size); + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); - // fmha_out = transpose(qktv_out) - // o = o.transpose(-2, -3).contiguous() - ComputeQKTVTransposeForward(*qktv_out, fmha_out); + phi::DenseTensor* query_out = config->GetQueryOut(); + phi::DenseTensor* key_out = config->GetKeyOut(); + phi::DenseTensor* value_out = config->GetValueOut(); + ComputeQKVTransposeForward(*query_out, + *key_out, + *value_out, + q_transpose_out, + k_transpose_out, + v_transpose_out); + + // q_size != k_size + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); } + // qk_out = BatchedGEMM(Q, K^T) + // [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] * + // [batch_size, seq_len_m, num_heads, m_size, head_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] + phi::DenseTensor* qk_out = config->GetQKOut(softmax_out); + T* qk_out_ptr = qk_out->data(); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + int64_t gemm_m = config->seq_len_r; + int64_t gemm_n = config->m_size; + int64_t gemm_k = config->head_dim; + // attn = torch.matmul(q, k.transpose(-1, -2)) + T alpha = static_cast(1.0 / sqrt(config->head_dim)); + // ComputeBatchedGEMM(merge_qkv_ ? + // phi::slice_ddim(qkv_transpose_out->dims(), + // 1, + // qkv_transpose_out->dims().size() + // - 1) : q_transpose_out->dims(), + // merge_qkv_ ? 
+ // phi::slice_ddim(qkv_transpose_out->dims(), + // 1, + // qkv_transpose_out->dims().size() + // - 1) : k_transpose_out->dims(), + // q_ptr, + // k_ptr, + // qk_out_ptr, + // false, + // true, + // gemm_m, + // gemm_n, + // gemm_k, + // gemm_batch_size, + // alpha); + + ComputeBatchedGEMM(q_ptr, + k_ptr, + qk_out_ptr, + false, + true, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size, + alpha); + // attn = softmax_dropout(attn, 0, self.training, mask=mask, bias=bias) + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + ComputeBiasMaskSoftmaxForward( + nonbatched_bias, src_mask, qk_out, softmax_out); + config->ClearQKOut(); + + // qktv_out = BatchedGEMM(softmax_out, V) + // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * + // [batch_size, seq_len_m, num_heads, m_size, head_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] + phi::DenseTensor* qktv_out = config->GetQKTVOut(gate_out); + T* qktv_out_ptr = qktv_out->data(); + + gemm_m = config->seq_len_r; + gemm_n = config->head_dim; + gemm_k = config->m_size; + + // o = torch.matmul(attn, v) + T* softmax_out_ptr = softmax_out->data(); + // ComputeBatchedGEMM(softmax_out->dims(), + // merge_qkv_ ? + // phi::slice_ddim(qkv_transpose_out->dims(), + // 1, + // qkv_transpose_out->dims().size() + // - 1) : v_transpose_out->dims(), + // softmax_out_ptr, + // v_ptr, + // qktv_out_ptr, + // false, + // false, + // gemm_m, + // gemm_n, + // gemm_k, + // gemm_batch_size); + + ComputeBatchedGEMM(softmax_out_ptr, + v_ptr, + qktv_out_ptr, + false, + false, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size); + + // fmha_out = transpose(qktv_out) + // o = o.transpose(-2, -3).contiguous() + ComputeQKTVTransposeForward(*qktv_out, fmha_out); config->ClearQKTVOut(); if (config->has_gating) { @@ -843,7 +619,7 @@ class FMHAGateRef { std::cout << "[Grad]: T is float. \n"; } - if (config->UseFlashAttn(merge_qkv_, config->use_flash_attn && is_bf16)) { + if (config->CanUseFlashAttn()) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be" @@ -1275,15 +1051,6 @@ class FMHAGateRef { dev_ctx_, qkv_out, perm, qkv_transpose_out); } - // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> - // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] - void ComputeQKVTransposeForwardForFlashAttn( - const phi::DenseTensor& qkv_out, phi::DenseTensor* qkv_transpose_out) { - std::vector perm = {3, 0, 1, 2, 4, 5}; - phi::funcs::TransposeGPUKernelDriver( - dev_ctx_, qkv_out, perm, qkv_transpose_out); - } - void ComputeQKVTransposeBackward( const phi::DenseTensor& qkv_transpose_out_grad, phi::DenseTensor* qkv_out_grad) { @@ -1449,5 +1216,268 @@ class FMHAGateRef { bool merge_qkv_; }; +template +class FlashAttnWithGating { + public: + FlashAttnWithGating(const phi::GPUContext& dev_ctx, bool merge_qkv) + : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} + + void ComputeForward(const phi::DenseTensor* nonbatched_bias, + const phi::DenseTensor* src_mask, + phi::DenseTensor* q_transpose_out, + phi::DenseTensor* k_transpose_out, + phi::DenseTensor* v_transpose_out, + phi::DenseTensor* qkv_transpose_out, + phi::DenseTensor* softmax_out, + phi::DenseTensor* softmax_lse, + phi::DenseTensor* fmha_out, + phi::DenseTensor* gate_out, + GateAttentionConfig* config) { + T* q_ptr = nullptr; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + bool is_bf16 = + qkv_transpose_out->dtype() == DataType::BFLOAT16 ? 
true : false; + + if (std::is_same::value) { + LOG(INFO) << "T is phi::dtype::float16."; + } else if (std::is_same::value) { + LOG(INFO) << "T is phi::dtype::bfloat16."; + } else if (std::is_same::value) { + LOG(INFO) << "T is float."; + } + + LOG(INFO) << "Use flash attention"; + + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + // 1. Dealing with qkv_out for flash_attn. + phi::DenseTensor* qkv_out = config->GetQKVOut(); + ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); + config->ClearQKVOut(); + + int seq_batch_size = static_cast(config->batch_size) * + static_cast(config->seq_len_m); + qkv_transpose_out->Resize( + {3, + seq_batch_size * static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); + DBG_WAIT; + + // q_size == k_size + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + + // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. + phi::DenseTensor cu_seq_q, cu_seq_k; + int64_t end_size = (seq_batch_size + 1); + int64_t seq_size = 0; + int64_t start = 0, end = end_size, + step = static_cast(config->seq_len_r); + phi::funcs::GetSize(start, end, step, &seq_size); + cu_seq_q.Resize({end_size}); + cu_seq_k.Resize({end_size}); + AllocWithDebugInfo(dev_ctx_, "cu_seq_q", &cu_seq_q); + AllocWithDebugInfo(dev_ctx_, "cu_seq_k", &cu_seq_k); + int64_t block = std::min(seq_size, static_cast(256)); + int64_t grid = (seq_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, cu_seq_q.data(), cu_seq_k.data()); + VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start + << ", step = " << step << ", end = " << end; + DBG_WAIT; + + // 3. Dealing with mask and bias for flash_attn. + phi::DenseTensor temp_mask, temp_bias; + auto dims_merge_func = [&](const phi::DenseTensor* src_tensor, + phi::DenseTensor* dst_tensor, + const std::string& prefix) { + if (src_tensor) { + int64_t first_dim = 1; + dst_tensor->ShareDataWith(*src_tensor); + auto dims_ = src_tensor->dims(); + for (int i = 0; i < dims_.size() - 3; ++i) { + first_dim *= dims_[i]; + } + auto dims_rank = dims_.size(); + dst_tensor->Resize({first_dim, + dims_[dims_rank - 3], + dims_[dims_rank - 2], + dims_[dims_rank - 1]}); + GetFlashAttnDimsString(prefix, temp_mask.dims()); + } + }; + auto& qkv_dims = qkv_transpose_out->dims(); + dims_merge_func(src_mask, &temp_mask, "mask_dim"); + dims_merge_func(nonbatched_bias, &temp_bias, "bias_dim"); + GetFlashAttnDimsString("qkv_transpose_out", qkv_dims); + DBG_WAIT; + // 4. flash_attn parameter setting. + + int batch_size_ = seq_batch_size; + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_ = batch_size_; + int max_seqlen_k_ = batch_size_; + int num_splits = 0; // 0 for an internal heuristic, which is optimal + VLOG(6) << "[Flash_attn Fwd] batch_size : " << batch_size_; + VLOG(6) << "[Flash_attn Fwd] total_q : " << total_q_; + VLOG(6) << "[Flash_attn Fwd] total_k : " << total_k_; + VLOG(6) << "[Flash_attn Fwd] num_heads : " << num_heads_; + VLOG(6) << "[Flash_attn Fwd] head_size : " << head_size_; + VLOG(6) << "[Flash_attn Fwd] max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "[Flash_attn Fwd] max_seqlen_k : " << max_seqlen_k_; + + // 5. 
construct softmax_lse + int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; + softmax_lse->Resize({batch_size_, num_heads_, softmax_lse_last_dim}); + AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", softmax_lse); + + DBG_WAIT; + // 6. construct random seed + auto gen = dev_ctx_.GetGenerator(); + uint64_t inc = batch_size_ * num_heads_ * 32; + auto seed_offset_pair = gen->IncrementOffset(inc); + uint64_t seed = seed_offset_pair.first; + uint64_t offset = seed_offset_pair.second; + + GetFlashAttnDimsString("softmax_out", softmax_out->dims()); + GetFlashAttnDimsString("softmax_lse", softmax_lse->dims()); + GetFlashAttnDimsString("cu_seq_q", cu_seq_q.dims()); + GetFlashAttnDimsString("cu_seq_k", cu_seq_k.dims()); + DBG_WAIT; + + // 7. flas_attn part one, get temp worksapce size. + float p_dropout = 0.f; + float softmax_scale = static_cast(1); + cudaStream_t stream = dev_ctx_.stream(); + uint64_t workspace_size; + bool succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + nullptr, // for calculation workspace size + cu_seq_q.data(), + cu_seq_k.data(), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + p_dropout, + softmax_scale, + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16, + num_splits, + softmax_lse->data(), + softmax_out->data(), + nullptr, + &workspace_size, + stream, + seed, + offset, + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? temp_bias.data() : nullptr, + temp_mask.dims().Get(), + temp_bias.dims().Get()); + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + DBG_WAIT; + + phi::DenseTensor workspace; + printf("workspace_size = %d\n", workspace_size); + if (workspace_size > 0) { + workspace = phi::Empty( + dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + DBGPTR(workspace.data(), "workspace"); + } + DBG_WAIT; + +#define DBG_INIT(prefix, x) \ + do { \ + printf("[%s, %d] ", __func__, __LINE__); \ + if (x->initialized()) { \ + std::cout << prefix << " is initialized." << std::endl; \ + } else { \ + std::cout << prefix << " is not initialized." << std::endl; \ + } \ + } while (0); + + DBG_INIT("qkv_transpose_out", qkv_transpose_out); + DBG_INIT("softmax_out", softmax_out); + DBG_INIT("src_mask", src_mask); + DBG_INIT("fmha_out", fmha_out); + DBG_INIT("gate_out", gate_out); + + // 8. flas_attn part two, run impl. + succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + static_cast(fmha_out->data()), // for calculation workspace size + cu_seq_q.data(), + cu_seq_k.data(), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + p_dropout, + softmax_scale, + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16, + num_splits, + softmax_lse->data(), + softmax_out->data(), + (workspace_size > 0) ? static_cast(workspace.data()) : nullptr, + &workspace_size, + stream, + seed, + offset, + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? 
temp_bias.data() : nullptr, + temp_mask.dims().Get(), + temp_bias.dims().Get()); + DBG_WAIT; + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + DBG_WAIT; + + if (config->has_gating) { + gate_out->Resize(config->gate_out_dims); + } + } + + private: + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> + // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] + void ComputeQKVTransposeForwardForFlashAttn( + const phi::DenseTensor& qkv_out, phi::DenseTensor* qkv_transpose_out) { + std::vector perm = {3, 0, 1, 2, 4, 5}; + phi::funcs::TransposeGPUKernelDriver( + dev_ctx_, qkv_out, perm, qkv_transpose_out); + } + + const phi::GPUContext& dev_ctx_; + bool merge_qkv_; +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 0c965ac08745a..a3fd178059fab 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -157,14 +157,14 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate() .AsDispensable(); AddOutput("SoftmaxOut", "Result in fmha.").AsIntermediate(); + AddOutput("SoftmaxLse", "Result of the flash attention.") + .AsIntermediate() + .AsDispensable(); AddOutput("FMHAOut", "Result in fmha.").AsIntermediate(); AddOutput("GateOut", "Result of the gating module.") .AsIntermediate() .AsDispensable(); AddOutput("Out", "Result after attention."); - AddOutput("SoftmaxLse", "Result of the gating module.") - .AsIntermediate() - .AsDispensable(); AddAttr("has_gating", "if true, the attention op uses gate architecure, " "[default true].") diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 938d0c707d08c..afee194827ceb 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -428,17 +428,33 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { } // 2. FMHA - auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); - fmha_compute.ComputeForward(nonbatched_bias, - src_mask, - q_transpose_out, - k_transpose_out, - v_transpose_out, - qkv_transpose_out, - softmax_out, - fmha_out, - gate_out, - &config); + if (config.CanUseFlashAttn()) { + auto *softmax_lse = ctx.Output("SoftmaxLse"); + auto fmha_compute = FlashAttnWithGating(dev_ctx, merge_qkv); + fmha_compute.ComputeForward(nonbatched_bias, + src_mask, + q_transpose_out, + k_transpose_out, + v_transpose_out, + qkv_transpose_out, + softmax_out, + softmax_lse, + fmha_out, + gate_out, + &config); + } else { + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); + fmha_compute.ComputeForward(nonbatched_bias, + src_mask, + q_transpose_out, + k_transpose_out, + v_transpose_out, + qkv_transpose_out, + softmax_out, + fmha_out, + gate_out, + &config); + } // 3. 
Gating Linear if (has_gating) { From 562d2daf7529a4faf62d3d9dd20ecf8f6965b66f Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:20:10 +0800 Subject: [PATCH 033/405] Fix gpu ps compile patch error (#53256) * fix patch error * fix patch error --- cmake/external/eigen.cmake | 2 +- cmake/external/gloo.cmake | 4 ++-- cmake/external/gtest.cmake | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 12ccf5df27cae..eda9ae8c4a2ac 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -52,7 +52,7 @@ if(CMAKE_COMPILER_IS_GNUCC) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch complex_header) set(EIGEN_PATCH_COMMAND - patch -d ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < + patch -Nd ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header}) endif() endif() diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 0666d48538b74..63212c974e257 100755 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -55,8 +55,8 @@ if(CMAKE_COMPILER_IS_GNUCC) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch types_header) set(GLOO_PATCH_COMMAND - patch -d ${GLOO_SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch - -d ${GLOO_SOURCE_DIR}/gloo/ < ${types_header}) + patch -Nd ${GLOO_SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && + patch -Nd ${GLOO_SOURCE_DIR}/gloo/ < ${types_header}) endif() endif() include_directories(${GLOO_INCLUDE_DIR}) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index be6d9cdde61e7..315f6b5b752b2 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -66,7 +66,7 @@ endif() if(NOT WIN32 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gtest/gtest-death-test.cc.patch native_src) - set(GTEST_PATCH_COMMAND patch -d ${GTEST_SOURCE_DIR}/googletest/src < + set(GTEST_PATCH_COMMAND patch -Nd ${GTEST_SOURCE_DIR}/googletest/src < ${native_src}) endif() if(WIN32) From ddd7203987fe8bd464b2a1d7652085e9f4cee6e0 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 24 Apr 2023 19:33:16 +0800 Subject: [PATCH 034/405] fix dist_grad kernel (#53239) --- paddle/phi/kernels/dist_grad_kernel.cc | 48 +++++++++++-------- .../fluid/tests/unittests/test_dist_op.py | 9 ++++ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index ba468ad299e4c..17c24fa905b5c 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -52,6 +52,10 @@ void DistGradKernel(const Context& dev_ctx, float p, DenseTensor* x_grad, DenseTensor* y_grad) { + if ((!x_grad) && (!y_grad)) { + return; + } + auto t = Subtract(dev_ctx, x, y); DenseTensor x_grad_tmp; x_grad_tmp.Resize(t.dims()); @@ -59,26 +63,32 @@ void DistGradKernel(const Context& dev_ctx, y_grad_tmp.Resize(t.dims()); PNormGradKernel( dev_ctx, t, out, out_grad, p, -1, 1e-12, false, true, &x_grad_tmp); - ScaleKernel(dev_ctx, x_grad_tmp, -1.0, 0.0, false, &y_grad_tmp); - // do reduce, the implemetation of cpu SumKernel has bug, it changes - // the dims of output iternally, so we Resize x/y_grad twice. 
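  // Note on the rewrite below: the replacement kernel returns early when
  // neither gradient output is requested and wraps each reduction in
  // `if (x_grad)` / `if (y_grad)`, so DistGradKernel no longer writes to a
  // gradient tensor the caller did not ask for (the old code unconditionally
  // resized and filled both x_grad and y_grad).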
- auto res_x = GetReduceDims(x_grad_tmp.dims(), x.dims()); - if (!std::get<0>(res_x).empty()) { - x_grad->Resize(phi::make_ddim(std::get<1>(res_x))); - SumKernel( - dev_ctx, x_grad_tmp, std::get<0>(res_x), x.dtype(), false, x_grad); - x_grad->Resize(x.dims()); - } else { - x_grad->ShareBufferWith(x_grad_tmp); + + if (x_grad) { + // do reduce, the implemetation of cpu SumKernel has bug, it changes + // the dims of output iternally, so we Resize x/y_grad twice. + auto res_x = GetReduceDims(x_grad_tmp.dims(), x.dims()); + if (!std::get<0>(res_x).empty()) { + x_grad->Resize(phi::make_ddim(std::get<1>(res_x))); + SumKernel( + dev_ctx, x_grad_tmp, std::get<0>(res_x), x.dtype(), false, x_grad); + x_grad->Resize(x.dims()); + } else { + x_grad->ShareBufferWith(x_grad_tmp); + } } - auto res_y = GetReduceDims(y_grad_tmp.dims(), y.dims()); - if (!std::get<0>(res_y).empty()) { - y_grad->Resize(phi::make_ddim(std::get<1>(res_y))); - SumKernel( - dev_ctx, y_grad_tmp, std::get<0>(res_y), y.dtype(), false, y_grad); - y_grad->Resize(y.dims()); - } else { - y_grad->ShareBufferWith(y_grad_tmp); + + if (y_grad) { + ScaleKernel(dev_ctx, x_grad_tmp, -1.0, 0.0, false, &y_grad_tmp); + auto res_y = GetReduceDims(y_grad_tmp.dims(), y.dims()); + if (!std::get<0>(res_y).empty()) { + y_grad->Resize(phi::make_ddim(std::get<1>(res_y))); + SumKernel( + dev_ctx, y_grad_tmp, std::get<0>(res_y), y.dtype(), false, y_grad); + y_grad->Resize(y.dims()); + } else { + y_grad->ShareBufferWith(y_grad_tmp); + } } } diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py index 4ec55cb7938df..96c0de915cff2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_op.py +++ b/python/paddle/fluid/tests/unittests/test_dist_op.py @@ -192,6 +192,15 @@ def test_api(self): ) np.testing.assert_allclose(dist(x_i, y_i, p), out[0], rtol=1e-05) + def test_grad_x(self): + paddle.disable_static() + a = paddle.rand([2, 2, 3, 2]) + b = paddle.rand([1, 1, 3, 1]) + a.stop_gradient = False + c = paddle.dist(a, b, 2) + c.backward() + paddle.enable_static() + if __name__ == '__main__': paddle.enable_static() From 9f9cd919a669428fe47e4184b0d334a25e6c6e61 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 24 Apr 2023 20:25:29 +0800 Subject: [PATCH 035/405] [Zero-Dim] Support paddle.max output 0D, test=allcase (#53242) --- .../operators/reduce_ops/reduce_max_op.cc | 7 +-- .../composite_backward_api.h | 2 +- paddle/phi/api/yaml/legacy_ops.yaml | 2 +- paddle/phi/kernels/cpu/add_n_kernel.cc | 2 + .../kernels/funcs/selected_rows_functor.cc | 1 + paddle/phi/kernels/funcs/unsqueeze.h | 18 +++---- .../phi/kernels/onednn/reduce_kernel_impl.h | 6 ++- .../fleet/utils/mix_precision_utils.py | 2 +- python/paddle/fluid/dygraph/math_op_patch.py | 2 +- .../unittests/test_learning_rate_scheduler.py | 8 +-- .../tests/unittests/test_lr_scheduler.py | 2 +- .../tests/unittests/test_zero_dim_tensor.py | 49 +++++++++++++++++-- python/paddle/hapi/progressbar.py | 9 +++- python/paddle/nn/clip.py | 2 +- python/paddle/nn/quant/lsq.py | 4 +- test/autograd/utils.py | 5 +- .../seq2seq_dygraph_model.py | 2 +- test/dygraph_to_static/test_for_enumerate.py | 2 +- test/dygraph_to_static/test_sentiment.py | 4 +- 19 files changed, 88 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index f53b6d985e23b..1bb84a054698d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ 
b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -54,9 +54,10 @@ class ReduceMaxCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { } // namespace operators } // namespace paddle -DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, - ReduceMaxInferShapeFunctor, - PD_INFER_META(phi::OriginReduceInferMetaBase)); +DECLARE_INFER_SHAPE_FUNCTOR( + reduce_max, + ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceIntArrayAxisInferMetaBase)); REGISTER_OPERATOR( reduce_max, diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index c722aa4858388..a42c41c1ba229 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1335,7 +1335,7 @@ void max_grad(const Tensor& x, } else { auto axis_ = std::vector(); if (reduce_all) { - for (int64_t i = 1; i < x_dim_size; i++) { + for (int64_t i = 0; i < x_dim_size; i++) { axis_.push_back(i); } } else { diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index a3ff474a69ecc..d23a20b18fcc4 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -744,7 +744,7 @@ args : (Tensor x, IntArray axis={}, bool keepdim=false) output : Tensor(out) infer_meta : - func : OriginReduceInferMeta + func : ReduceIntArrayAxisInferMeta kernel : func : max backward : max_grad diff --git a/paddle/phi/kernels/cpu/add_n_kernel.cc b/paddle/phi/kernels/cpu/add_n_kernel.cc index 650d33755dca9..e165f1cc7f24b 100644 --- a/paddle/phi/kernels/cpu/add_n_kernel.cc +++ b/paddle/phi/kernels/cpu/add_n_kernel.cc @@ -89,6 +89,7 @@ PD_REGISTER_KERNEL(add_n, double, int, phi::dtype::bfloat16, + phi::dtype::float16, int64_t) {} PD_REGISTER_KERNEL(add_n_array, @@ -99,4 +100,5 @@ PD_REGISTER_KERNEL(add_n_array, double, int, phi::dtype::bfloat16, + phi::dtype::float16, int64_t) {} diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index 9fc69ddab4a9c..efeb742cfd2ae 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -395,6 +395,7 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h index 136fb9b2924bd..b15e781b25117 100644 --- a/paddle/phi/kernels/funcs/unsqueeze.h +++ b/paddle/phi/kernels/funcs/unsqueeze.h @@ -105,19 +105,19 @@ inline DDim GetOutputSqueezeShape(const std::vector squeeze_dims, inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, const DDim& in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); + int output_rank = in_dims.size() + static_cast(unsqz_dims.size()); + int cur_output_rank = in_dims.size(); + std::vector output_shape(output_rank, 0); // Validity Check: rank range. PADDLE_ENFORCE_LE( - output_size, + output_rank, 6, phi::errors::InvalidArgument("The output " "tensor's rank should be less than 6.")); for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; + int cur = axis < 0 ? 
axis + cur_output_rank + 1 : axis; // Vaildity Check: the axis bound PADDLE_ENFORCE_GE( cur, @@ -125,12 +125,12 @@ inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, phi::errors::InvalidArgument("The insert dimension value should " "not be less than 0")); PADDLE_ENFORCE_LE(cur, - cur_output_size, + cur_output_rank, phi::errors::InvalidArgument( "The insert dimension value shoule not be larger " "than the dimension size of input tensor")); // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { + for (int i = cur_output_rank; i >= cur; --i) { if (output_shape[i] == 1) { // Move axis output_shape[i + 1] = 1; @@ -139,11 +139,11 @@ inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, } output_shape[cur] = 1; // Add the output size. - cur_output_size++; + cur_output_rank++; } // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { + for (int in_idx = 0, out_idx = 0; out_idx < output_rank; ++out_idx) { if (output_shape[out_idx] == 0) { output_shape[out_idx] = in_dims[in_idx++]; } diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index d9e01f79bed6a..69f667c36624b 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -102,8 +102,10 @@ void ReduceKernel(const Context& dev_ctx, reduction_p->execute(astream, reduction_args); astream.wait(); - out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + const auto reshape_dims = out->dims().size() != 0 + ? vectorize(out->dims()) + : std::vector{1}; + out->set_mem_desc(dst_memory_p->get_desc().reshape(reshape_dims)); } } diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py index 1ee26bce1fb85..9e172d840bdfc 100644 --- a/python/paddle/distributed/fleet/utils/mix_precision_utils.py +++ b/python/paddle/distributed/fleet/utils/mix_precision_utils.py @@ -242,7 +242,7 @@ def unscale_method(self, optimizer): paddle.distributed.all_reduce( is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None ) - self._found_inf = is_found_inf.numpy()[0] + self._found_inf = int(is_found_inf) class MixPrecisionScaler: diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 47d3451122943..220d849070d18 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -179,7 +179,7 @@ def _ndim_(var): @property def _size_(var): - return np.prod(var.shape) + return int(np.prod(var.shape)) @property def _T_(var): diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index e92f44f5e1b41..126d5bfbf753e 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -212,7 +212,7 @@ def test_LR_state_dict(self): adam_test.set_dict(opt_state) self.assertEqual( adam_test._learning_rate.best_loss, - adam3._learning_rate.best_loss.numpy()[0], + adam3._learning_rate.best_loss, "best_loss is different before and after set_dict", ) self.assertEqual( @@ -275,7 +275,7 @@ def test_LinearLrWarmup(self): t = lr() np.testing.assert_allclose( - t.numpy()[0].item(), right_result[i], rtol=1e-05 + t.numpy().item(), right_result[i], rtol=1e-05 ) with self.assertRaises(TypeError): @@ -342,7 +342,7 @@ def 
test_StepDecay(self): right_result = step_decay( epoch, learning_rate, step_size, decay_rate ) - fluid_result = scheduler().numpy()[0] + fluid_result = scheduler().numpy().item() scheduler.epoch() self.assertAlmostEqual( right_result, @@ -371,7 +371,7 @@ def test_LambdaDecay(self): for epoch in range(30): right_result = lambda_decay(epoch, learning_rate, lr_lambda) - fluid_result = scheduler().numpy()[0] + fluid_result = scheduler().numpy().item() scheduler.epoch() self.assertAlmostEqual( right_result, diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 56cf520b1c997..0202d7540d234 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -208,7 +208,7 @@ def _test_dygraph(self, place, kwargs): self.assertEqual( scheduler.cooldown_counter, scheduler1.cooldown_counter ) - self.assertEqual(scheduler.best.numpy()[0], scheduler1.best) + self.assertEqual(scheduler.best, scheduler1.best) self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs) self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch) self.assertEqual(scheduler.last_lr, scheduler1.last_lr) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 965bcae57d9db..9c049ddbf435d 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -219,17 +219,19 @@ def test_dygraph_reduce(self): self.assertEqual(x.grad.shape, []) np.testing.assert_allclose(x.grad.numpy(), np.array(3.0)) - # 2) x is ND if api in [ paddle.sum, paddle.mean, paddle.nanmean, paddle.nansum, - paddle.max, ]: return - x = paddle.rand([3, 5]) + # 2) x is ND, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [3, 5]).astype('bool') + else: + x = paddle.rand([3, 5]) x.stop_gradient = False out = api(x, None) out.retain_grads() @@ -240,6 +242,21 @@ def test_dygraph_reduce(self): self.assertEqual(out.grad.shape, []) self.assertEqual(x.grad.shape, [3, 5]) + # 3) x is 1D, axis=0, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [5]).astype('bool') + else: + x = paddle.rand([5]) + x.stop_gradient = False + out = api(x, 0) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, [5]) + paddle.enable_static() def test_static_reduce(self): @@ -284,16 +301,19 @@ def test_static_reduce(self): np.testing.assert_allclose(res[2], np.array(1.0)) np.testing.assert_allclose(res[3], np.array(1.0)) - # 2) x is ND if api in [ paddle.sum, paddle.mean, paddle.nanmean, paddle.nansum, - paddle.max, ]: return + # 2) x is ND, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [3, 5]).astype('bool') + else: + x = paddle.rand([3, 5]) x = paddle.rand([3, 5]) x.stop_gradient = False out = api(x, None) @@ -309,6 +329,25 @@ def test_static_reduce(self): self.assertEqual(res[1].shape, ()) self.assertEqual(res[2].shape, (3, 5)) + # 3) x is 1D, axis=0, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [5]).astype('bool') + else: + x = paddle.rand([5]) + x.stop_gradient = False + out = api(x, 0) + paddle.static.append_backward(out) + + fetch_list = [out] + if block.has_var(x.grad_name): + fetch_list.extend([out.grad_name, x.grad_name]) + + res = 
exe.run(main_prog, fetch_list=fetch_list) + self.assertEqual(res[0].shape, ()) + if len(res) > 1: + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (5,)) + paddle.disable_static() diff --git a/python/paddle/hapi/progressbar.py b/python/paddle/hapi/progressbar.py index e63bb913334ea..08d6cf2b12181 100644 --- a/python/paddle/hapi/progressbar.py +++ b/python/paddle/hapi/progressbar.py @@ -81,8 +81,13 @@ def convert_uint16_to_float(in_list): for i, (k, val) in enumerate(values): if k == "loss": - val = val if isinstance(val, (list, np.ndarray)) else [val] - if isinstance(val[0], np.uint16): + if isinstance(val, list): + scalar_val = val[0] + elif isinstance(val, np.ndarray): + scalar_val = val.item() + else: + scalar_val = val + if isinstance(scalar_val, np.uint16): values[i] = ("loss", list(convert_uint16_to_float(val))) if current_num: diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 58332802efa29..1952ea0514b50 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -700,7 +700,7 @@ def _dygraph_clip(self, params_grads): global_norm_var = paddle.add_n(global_norm_var) global_norm_var = paddle.sqrt(global_norm_var) max_global_norm = paddle.full( - shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm + shape=[], dtype=global_norm_var.dtype, fill_value=self.clip_norm ) need_clip = False diff --git a/python/paddle/nn/quant/lsq.py b/python/paddle/nn/quant/lsq.py index 8f225e9265980..4fa8f55266a38 100644 --- a/python/paddle/nn/quant/lsq.py +++ b/python/paddle/nn/quant/lsq.py @@ -178,7 +178,7 @@ def __init__( s_attr = ParamAttr( name=self._scale_name, initializer=Constant(1.0), trainable=True ) - self.s = self.create_parameter(shape=[1], attr=s_attr, dtype='float32') + self.s = self.create_parameter(shape=[], attr=s_attr, dtype='float32') self.s.stop_gradient = False if not self.symmetric: @@ -189,7 +189,7 @@ def __init__( name=self._beta_name, initializer=Constant(0.0), trainable=True ) self.beta = self.create_parameter( - shape=[1], attr=beta_attr, dtype='float32' + shape=[], attr=beta_attr, dtype='float32' ) self.beta.stop_gradient = False diff --git a/test/autograd/utils.py b/test/autograd/utils.py index 7fbc456b6b5df..de1db9f2a19f5 100644 --- a/test/autograd/utils.py +++ b/test/autograd/utils.py @@ -26,10 +26,7 @@ # Finite Difference Utils ########################################################## def _product(t): - if isinstance(t, int): - return t - else: - return np.product(t) + return int(np.product(t)) def _get_item(t, idx): diff --git a/test/dygraph_to_static/seq2seq_dygraph_model.py b/test/dygraph_to_static/seq2seq_dygraph_model.py index df9746a57986a..3b9f343ca9fad 100644 --- a/test/dygraph_to_static/seq2seq_dygraph_model.py +++ b/test/dygraph_to_static/seq2seq_dygraph_model.py @@ -407,7 +407,7 @@ def beam_search(self, inputs): parent_ids = [] for step_idx in range(paddle.to_tensor(self.beam_max_step_num)): - if paddle.sum(1 - beam_finished).numpy()[0] == 0: + if paddle.sum(1 - beam_finished) == 0: break step_input = self._merge_batch_beams(step_input) new_dec_hidden, new_dec_cell = [], [] diff --git a/test/dygraph_to_static/test_for_enumerate.py b/test/dygraph_to_static/test_for_enumerate.py index 58fd9a0b03e8b..1aca0549213c2 100644 --- a/test/dygraph_to_static/test_for_enumerate.py +++ b/test/dygraph_to_static/test_for_enumerate.py @@ -28,7 +28,7 @@ def for_in_range(x): z = paddle.tensor.fill_constant([1], 'int32', 0) x = fluid.dygraph.to_variable(x) - for i in range(x.numpy()[0]): + for i in 
range(x.numpy().item()): z = z + i return z diff --git a/test/dygraph_to_static/test_sentiment.py b/test/dygraph_to_static/test_sentiment.py index 5de0d47387931..3ad8ce6333433 100644 --- a/test/dygraph_to_static/test_sentiment.py +++ b/test/dygraph_to_static/test_sentiment.py @@ -342,7 +342,7 @@ def train(args, to_static): model.train() avg_cost, prediction, acc = model(doc, label) - loss_data.append(avg_cost.numpy()[0]) + loss_data.append(float(avg_cost)) avg_cost.backward() sgd_optimizer.minimize(avg_cost) @@ -358,7 +358,7 @@ def train(args, to_static): "step: %d, ave loss: %f, speed: %f steps/s" % ( batch_id, - avg_cost.numpy()[0], + float(avg_cost), args.log_step / used_time, ) ) From 66f07bcdb5dc118303c1ebf446b272f752019863 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 24 Apr 2023 21:12:13 +0800 Subject: [PATCH 036/405] Fix forward. --- .../operators/fused/fused_gate_attention.h | 126 +++++++++--------- .../fused/fused_gate_attention_op.cu | 7 +- 2 files changed, 69 insertions(+), 64 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index d0310383abe8b..45dd62a0d4ce7 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -56,6 +56,21 @@ void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, VLOG(4) << info << ": " << MemoryDebugString(*t); } +inline std::string TensorDebugString(const phi::DenseTensor* t) { + if (t && t->initialized()) { + std::stringstream ss; + ss << "shape=[" << t->dims() << "], ptr=" << t->data(); + return ss.str(); + } else { + return "nullptr"; + } +} + +inline std::string WaitWithDebugInfo(const phi::GPUContext& dev_ctx) { + dev_ctx.Wait(); + return "[Synchronize] "; +} + template struct TernaryAddFunctor { inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } @@ -391,7 +406,7 @@ static void GetFlashAttnDimsString(const std::string& prefix, const phi::DDim dim_val) { // if (VLOG_IS_ON(4)) { std::ostringstream out_string; - out_string << "FlashAttn - " << prefix << ".dims() is [ "; + out_string << "FlashAttn - " << prefix << ".dims() is ["; for (int i = 0; i < dim_val.size(); ++i) { out_string << dim_val[i] << ", "; } @@ -411,12 +426,6 @@ static void GetFlashAttnDimsString(const std::string& prefix, std::cout << out_string.str(); \ } while (0); -#define DBG_WAIT \ - do { \ - printf("[%s, %d] Run here.\n", __func__, __LINE__); \ - dev_ctx_.Wait(); \ - } while (0); - template class FMHAGateRef { public: @@ -635,7 +644,7 @@ class FMHAGateRef { int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. phi::DenseTensor cu_seq_q, cu_seq_k; @@ -655,7 +664,7 @@ class FMHAGateRef { start, step, end, cu_seq_q.data(), cu_seq_k.data()); VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start << ", step = " << step << ", end = " << end; - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); // 3. Dealing with mask and bias for flash_attn. 
phi::DenseTensor temp_mask, temp_bias; @@ -704,18 +713,18 @@ class FMHAGateRef { // softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); // AllocWithDebugInfo( // dev_ctx_, "flash_attn: softmax_lse", softmax_lse); - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); phi::DenseTensor softmax_d = phi::Empty( dev_ctx_, {batch_size_, num_heads_, last_q_dim}); - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); phi::DenseTensor bias_d; if (nonbatched_bias) { bias_d = phi::Empty( dev_ctx_, {batch_size_, num_heads_, max_seqlen_q_, max_seqlen_k_}); } - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); q_ptr = q_transpose_out->data(); k_ptr = k_transpose_out->data(); @@ -783,7 +792,7 @@ class FMHAGateRef { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); phi::DenseTensor workspace; printf("workspace_size = %d\n", workspace_size); @@ -792,7 +801,7 @@ class FMHAGateRef { dev_ctx_, {int64_t(workspace_size / sizeof(float))}); DBGPTR(workspace.data(), "workspace"); } - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( static_cast(q_ptr), @@ -833,7 +842,7 @@ class FMHAGateRef { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); if (nonbatched_bias) { // compare block reduce @@ -1228,7 +1237,6 @@ class FlashAttnWithGating { phi::DenseTensor* k_transpose_out, phi::DenseTensor* v_transpose_out, phi::DenseTensor* qkv_transpose_out, - phi::DenseTensor* softmax_out, phi::DenseTensor* softmax_lse, phi::DenseTensor* fmha_out, phi::DenseTensor* gate_out, @@ -1248,7 +1256,7 @@ class FlashAttnWithGating { LOG(INFO) << "T is float."; } - LOG(INFO) << "Use flash attention"; + LOG(INFO) << "Use flash attention: merge_qkv=" << merge_qkv_; PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, @@ -1267,7 +1275,10 @@ class FlashAttnWithGating { seq_batch_size * static_cast(config->seq_len_r), static_cast(config->num_heads), static_cast(config->head_dim)}); - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "1: Reshape qkv_transpose_out: [" + << config->qkv_transpose_out_dims << "] -> [" + << qkv_transpose_out->dims() << "]"; // q_size == k_size int64_t q_size = config->GetQuerySize(); @@ -1277,10 +1288,11 @@ class FlashAttnWithGating { // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t end_size = (seq_batch_size + 1); + int64_t end_size = seq_batch_size + 1; int64_t seq_size = 0; - int64_t start = 0, end = end_size, - step = static_cast(config->seq_len_r); + int64_t start = 0; + int64_t end = end_size; + int64_t step = static_cast(config->seq_len_r); phi::funcs::GetSize(start, end, step, &seq_size); cu_seq_q.Resize({end_size}); cu_seq_k.Resize({end_size}); @@ -1292,7 +1304,7 @@ class FlashAttnWithGating { start, step, end, cu_seq_q.data(), cu_seq_k.data()); VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start << ", step = " << step << ", end = " << end; - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "2: Init cu_seq_q and cu_seq_k"; // 3. Dealing with mask and bias for flash_attn. phi::DenseTensor temp_mask, temp_bias; @@ -1318,9 +1330,10 @@ class FlashAttnWithGating { dims_merge_func(src_mask, &temp_mask, "mask_dim"); dims_merge_func(nonbatched_bias, &temp_bias, "bias_dim"); GetFlashAttnDimsString("qkv_transpose_out", qkv_dims); - DBG_WAIT; - // 4. flash_attn parameter setting. 
+ LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "3: Merge dimensions for mask and bias"; + // 4. flash_attn parameter setting. int batch_size_ = seq_batch_size; int total_q_ = qkv_dims[1]; // q.dims()[0] int total_k_ = qkv_dims[1]; // q.dims()[0] @@ -1329,20 +1342,24 @@ class FlashAttnWithGating { int max_seqlen_q_ = batch_size_; int max_seqlen_k_ = batch_size_; int num_splits = 0; // 0 for an internal heuristic, which is optimal - VLOG(6) << "[Flash_attn Fwd] batch_size : " << batch_size_; - VLOG(6) << "[Flash_attn Fwd] total_q : " << total_q_; - VLOG(6) << "[Flash_attn Fwd] total_k : " << total_k_; - VLOG(6) << "[Flash_attn Fwd] num_heads : " << num_heads_; - VLOG(6) << "[Flash_attn Fwd] head_size : " << head_size_; - VLOG(6) << "[Flash_attn Fwd] max_seqlen_q : " << max_seqlen_q_; - VLOG(6) << "[Flash_attn Fwd] max_seqlen_k : " << max_seqlen_k_; + LOG(INFO) << "[Flash_attn Fwd] batch_size : " << batch_size_; + LOG(INFO) << "[Flash_attn Fwd] total_q : " << total_q_; + LOG(INFO) << "[Flash_attn Fwd] total_k : " << total_k_; + LOG(INFO) << "[Flash_attn Fwd] num_heads : " << num_heads_; + LOG(INFO) << "[Flash_attn Fwd] head_size : " << head_size_; + LOG(INFO) << "[Flash_attn Fwd] max_seqlen_q : " << max_seqlen_q_; + LOG(INFO) << "[Flash_attn Fwd] max_seqlen_k : " << max_seqlen_k_; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "4: Init flash-attention parameters"; // 5. construct softmax_lse int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; softmax_lse->Resize({batch_size_, num_heads_, softmax_lse_last_dim}); AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", softmax_lse); + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "5: Allocate softmax_lse: shape=[" << softmax_lse->dims() + << "]"; - DBG_WAIT; // 6. construct random seed auto gen = dev_ctx_.GetGenerator(); uint64_t inc = batch_size_ * num_heads_ * 32; @@ -1350,11 +1367,10 @@ class FlashAttnWithGating { uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; - GetFlashAttnDimsString("softmax_out", softmax_out->dims()); GetFlashAttnDimsString("softmax_lse", softmax_lse->dims()); GetFlashAttnDimsString("cu_seq_q", cu_seq_q.dims()); GetFlashAttnDimsString("cu_seq_k", cu_seq_k.dims()); - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "6: Construct random seed"; // 7. flas_attn part one, get temp worksapce size. float p_dropout = 0.f; @@ -1382,7 +1398,7 @@ class FlashAttnWithGating { is_bf16, num_splits, softmax_lse->data(), - softmax_out->data(), + nullptr, // softmax out, nullptr, &workspace_size, stream, @@ -1390,44 +1406,33 @@ class FlashAttnWithGating { offset, src_mask ? temp_mask.data() : nullptr, nonbatched_bias ? temp_bias.data() : nullptr, - temp_mask.dims().Get(), - temp_bias.dims().Get()); + src_mask ? temp_mask.dims().Get() : nullptr, + nonbatched_bias ? temp_bias.dims().Get() : nullptr); if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "7: Get workspace_size=" << workspace_size; phi::DenseTensor workspace; - printf("workspace_size = %d\n", workspace_size); if (workspace_size > 0) { workspace = phi::Empty( dev_ctx_, {int64_t(workspace_size / sizeof(float))}); DBGPTR(workspace.data(), "workspace"); } - DBG_WAIT; - -#define DBG_INIT(prefix, x) \ - do { \ - printf("[%s, %d] ", __func__, __LINE__); \ - if (x->initialized()) { \ - std::cout << prefix << " is initialized." << std::endl; \ - } else { \ - std::cout << prefix << " is not initialized." 
<< std::endl; \ - } \ - } while (0); + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "Allocate workspace"; - DBG_INIT("qkv_transpose_out", qkv_transpose_out); - DBG_INIT("softmax_out", softmax_out); - DBG_INIT("src_mask", src_mask); - DBG_INIT("fmha_out", fmha_out); - DBG_INIT("gate_out", gate_out); + LOG(INFO) << "qkv_transpose_out: " << TensorDebugString(qkv_transpose_out); + LOG(INFO) << "src_mask: " << TensorDebugString(src_mask); + LOG(INFO) << "fmha_out: " << TensorDebugString(fmha_out); + LOG(INFO) << "gate_out: " << TensorDebugString(gate_out); // 8. flas_attn part two, run impl. succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), - static_cast(fmha_out->data()), // for calculation workspace size + static_cast(fmha_out->data()), cu_seq_q.data(), cu_seq_k.data(), total_q_, @@ -1444,7 +1449,7 @@ class FlashAttnWithGating { is_bf16, num_splits, softmax_lse->data(), - softmax_out->data(), + nullptr, // softmax out (workspace_size > 0) ? static_cast(workspace.data()) : nullptr, &workspace_size, stream, @@ -1452,13 +1457,12 @@ class FlashAttnWithGating { offset, src_mask ? temp_mask.data() : nullptr, nonbatched_bias ? temp_bias.data() : nullptr, - temp_mask.dims().Get(), - temp_bias.dims().Get()); - DBG_WAIT; + src_mask ? temp_mask.dims().Get() : nullptr, + nonbatched_bias ? temp_bias.dims().Get() : nullptr); if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - DBG_WAIT; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "8: Run SUCCESS"; if (config->has_gating) { gate_out->Resize(config->gate_out_dims); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index afee194827ceb..3d10047bf23a0 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -371,7 +371,6 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { auto *v_transpose_out = ctx.Output("ValueTransposeOut"); auto *qkv_transpose_out = ctx.Output("QKVTransposeOut"); - auto *softmax_out = ctx.Output("SoftmaxOut"); auto *fmha_out = ctx.Output("FMHAOut"); auto *gate_out = ctx.Output("GateOut"); auto *out = ctx.Output("Out"); @@ -382,7 +381,6 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { bool use_fused_matmul_bias = true; auto &dev_ctx = ctx.template device_context(); - AllocWithDebugInfo(dev_ctx, "softmax_out", softmax_out); AllocWithDebugInfo(dev_ctx, "fmha_out", fmha_out); if (has_gating) { AllocWithDebugInfo(dev_ctx, "gate_out", gate_out); @@ -430,6 +428,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { // 2. FMHA if (config.CanUseFlashAttn()) { auto *softmax_lse = ctx.Output("SoftmaxLse"); + auto fmha_compute = FlashAttnWithGating(dev_ctx, merge_qkv); fmha_compute.ComputeForward(nonbatched_bias, src_mask, @@ -437,12 +436,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { k_transpose_out, v_transpose_out, qkv_transpose_out, - softmax_out, softmax_lse, fmha_out, gate_out, &config); } else { + auto *softmax_out = ctx.Output("SoftmaxOut"); + AllocWithDebugInfo(dev_ctx, "softmax_out", softmax_out); + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); fmha_compute.ComputeForward(nonbatched_bias, src_mask, From 387d26f3c20533fb5ea102ce5cf5f8862d6a5adb Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 24 Apr 2023 21:28:10 +0800 Subject: [PATCH 037/405] Remove some noused codes. 
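The blocks deleted below were commented-out duplicates of the live
ComputeBatchedGEMM calls next to them. For reference, the non-flash path that
those GEMMs implement is ordinary scaled dot-product attention with an
additive bias and mask, out = softmax(q * k^T / sqrt(head_dim) + bias + mask) * v.
The standalone sketch below (single batch, single head, plain std::vector,
invented function name) only illustrates that math; it is not code from this
repository.

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    // q: [seq_q, head_dim], k/v: [seq_k, head_dim],
    // bias/mask: [seq_q, seq_k], out: [seq_q, head_dim]
    void NaiveGateAttentionCore(const std::vector<std::vector<float>>& q,
                                const std::vector<std::vector<float>>& k,
                                const std::vector<std::vector<float>>& v,
                                const std::vector<std::vector<float>>& bias,
                                const std::vector<std::vector<float>>& mask,
                                std::vector<std::vector<float>>* out) {
      const std::size_t seq_q = q.size();
      const std::size_t seq_k = k.size();
      const std::size_t head_dim = q[0].size();
      const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
      for (std::size_t i = 0; i < seq_q; ++i) {
        // logits = q * k^T * (1 / sqrt(head_dim)) + bias + mask
        std::vector<float> logits(seq_k, 0.0f);
        float max_logit = -std::numeric_limits<float>::infinity();
        for (std::size_t j = 0; j < seq_k; ++j) {
          float dot = 0.0f;
          for (std::size_t c = 0; c < head_dim; ++c) dot += q[i][c] * k[j][c];
          logits[j] = dot * scale + bias[i][j] + mask[i][j];
          max_logit = std::max(max_logit, logits[j]);
        }
        // softmax over the key dimension (max-subtracted for stability)
        float denom = 0.0f;
        for (std::size_t j = 0; j < seq_k; ++j) {
          logits[j] = std::exp(logits[j] - max_logit);
          denom += logits[j];
        }
        // out = softmax(logits) * v
        for (std::size_t c = 0; c < head_dim; ++c) {
          float acc = 0.0f;
          for (std::size_t j = 0; j < seq_k; ++j) {
            acc += (logits[j] / denom) * v[j][c];
          }
          (*out)[i][c] = acc;
        }
      }
    }

The fused op computes the same quantity, but as batched GEMMs over
batch_size * seq_len_m * num_heads problems, with the bias/mask addition and
softmax fused in ComputeBiasMaskSoftmaxForward.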
--- .../operators/fused/fused_gate_attention.h | 42 ++----------------- 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 45dd62a0d4ce7..0b67382edc4b0 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -503,29 +503,8 @@ class FMHAGateRef { int64_t gemm_m = config->seq_len_r; int64_t gemm_n = config->m_size; int64_t gemm_k = config->head_dim; - // attn = torch.matmul(q, k.transpose(-1, -2)) T alpha = static_cast(1.0 / sqrt(config->head_dim)); - // ComputeBatchedGEMM(merge_qkv_ ? - // phi::slice_ddim(qkv_transpose_out->dims(), - // 1, - // qkv_transpose_out->dims().size() - // - 1) : q_transpose_out->dims(), - // merge_qkv_ ? - // phi::slice_ddim(qkv_transpose_out->dims(), - // 1, - // qkv_transpose_out->dims().size() - // - 1) : k_transpose_out->dims(), - // q_ptr, - // k_ptr, - // qk_out_ptr, - // false, - // true, - // gemm_m, - // gemm_n, - // gemm_k, - // gemm_batch_size, - // alpha); - + // attn = matmul(q, k.transpose(-1, -2)) ComputeBatchedGEMM(q_ptr, k_ptr, qk_out_ptr, @@ -536,6 +515,7 @@ class FMHAGateRef { gemm_k, gemm_batch_size, alpha); + // attn = softmax_dropout(attn, 0, self.training, mask=mask, bias=bias) // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) ComputeBiasMaskSoftmaxForward( @@ -553,24 +533,8 @@ class FMHAGateRef { gemm_n = config->head_dim; gemm_k = config->m_size; - // o = torch.matmul(attn, v) + // o = matmul(attn, v) T* softmax_out_ptr = softmax_out->data(); - // ComputeBatchedGEMM(softmax_out->dims(), - // merge_qkv_ ? - // phi::slice_ddim(qkv_transpose_out->dims(), - // 1, - // qkv_transpose_out->dims().size() - // - 1) : v_transpose_out->dims(), - // softmax_out_ptr, - // v_ptr, - // qktv_out_ptr, - // false, - // false, - // gemm_m, - // gemm_n, - // gemm_k, - // gemm_batch_size); - ComputeBatchedGEMM(softmax_out_ptr, v_ptr, qktv_out_ptr, From cf4a1c84205dbbe0a0b7c91e71ce9dda05542c67 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 24 Apr 2023 22:27:36 +0800 Subject: [PATCH 038/405] Change backward. --- .../operators/fused/fused_gate_attention.h | 751 ++++++++++-------- .../fused/fused_gate_attention_op.cc | 27 +- .../fused/fused_gate_attention_op.cu | 37 +- 3 files changed, 440 insertions(+), 375 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 0b67382edc4b0..07b4b5973cac8 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -563,11 +563,7 @@ class FMHAGateRef { const phi::DenseTensor* fmha_out_grad, phi::DenseTensor* src_mask_grad, phi::DenseTensor* nonbatched_bias_grad, - GateAttentionGradConfig* config, - const phi::DenseTensor* fmha_out = nullptr, - const phi::DenseTensor* softmax_lse = nullptr, - const phi::DenseTensor* nonbatched_bias = nullptr, - const phi::DenseTensor* src_mask = nullptr) { + GateAttentionGradConfig* config) { const T* q_ptr = nullptr; const T* k_ptr = nullptr; const T* v_ptr = nullptr; @@ -581,22 +577,12 @@ class FMHAGateRef { phi::DenseTensor v_transpose_out_grad; phi::DenseTensor qkv_transpose_out_grad; - bool is_bf16 = - qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; - - if (std::is_same::value) { - std::cout << "[Grad]: T is phi::dtype::float16. \n"; - } else if (std::is_same::value) { - std::cout << "[Grad]: T is phi::dtype::bfloat16. 
\n"; - } else if (std::is_same::value) { - std::cout << "[Grad]: T is float. \n"; - } - - if (config->CanUseFlashAttn()) { + if (merge_qkv_) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be" + platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); + int64_t q_size = config->GetQuerySize(); q_ptr = qkv_transpose_out->data(); k_ptr = q_ptr + q_size; @@ -606,93 +592,27 @@ class FMHAGateRef { AllocWithDebugInfo( dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); - int seq_batch_size = static_cast(config->batch_size) * - static_cast(config->seq_len_m); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. - phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t start = 0; - int64_t step = static_cast(config->seq_len_r); - int64_t end_size = (seq_batch_size + 1); - int64_t end = end_size; - int64_t seq_size = 0; - phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({end_size}); - cu_seq_k.Resize({end_size}); - AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_q", &cu_seq_q); - AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (seq_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, cu_seq_q.data(), cu_seq_k.data()); - VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start - << ", step = " << step << ", end = " << end; - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - // 3. Dealing with mask and bias for flash_attn. - phi::DenseTensor temp_mask, temp_bias; - auto dims_merge_func = [&](const phi::DenseTensor* src_tensor, - phi::DenseTensor* dst_tensor, - const std::string& prefix) { - if (src_tensor) { - int64_t first_dim = 1; - dst_tensor->ShareDataWith(*src_tensor); - auto dims_ = src_tensor->dims(); - for (int i = 0; i < dims_.size() - 3; ++i) { - first_dim *= dims_[i]; - } - auto dims_rank = dims_.size(); - dst_tensor->Resize({first_dim, - dims_[dims_rank - 3], - dims_[dims_rank - 2], - dims_[dims_rank - 1]}); - GetFlashAttnDimsString(prefix, temp_mask.dims()); - } - }; - dims_merge_func(src_mask, &temp_mask, "[Grad] mask_dim"); - dims_merge_func(nonbatched_bias, &temp_bias, "[Grad] bias_dim"); - - phi::DDim qkv_dims({3, - seq_batch_size * static_cast(config->seq_len_r), - static_cast(config->num_heads), - static_cast(config->head_dim)}); - int batch_size_ = seq_batch_size; - int total_q_ = qkv_dims[1]; // q.dims()[0] - int total_k_ = qkv_dims[1]; // q.dims()[0] - int num_heads_ = qkv_dims[2]; // q.dims()[1] - int head_size_ = qkv_dims[3]; // q.dims()[2] - int max_seqlen_q_ = batch_size_; - int max_seqlen_k_ = batch_size_; - VLOG(6) << "[Flash_attn Grad] batch_size : " << batch_size_; - VLOG(6) << "[Flash_attn Grad] total_q : " << total_q_; - VLOG(6) << "[Flash_attn Grad] total_k : " << total_k_; - VLOG(6) << "[Flash_attn Grad] num_heads : " << num_heads_; - VLOG(6) << "[Flash_attn Grad] head_size : " << head_size_; - VLOG(6) << "[Flash_attn Grad] max_seqlen_q : " << max_seqlen_q_; - VLOG(6) << "[Flash_attn Grad] max_seqlen_k : " << max_seqlen_k_; - - // 5. 
construct softmax_lse - int last_q_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; - // softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); - // AllocWithDebugInfo( - // dev_ctx_, "flash_attn: softmax_lse", softmax_lse); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - phi::DenseTensor softmax_d = phi::Empty( - dev_ctx_, {batch_size_, num_heads_, last_q_dim}); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - phi::DenseTensor bias_d; - if (nonbatched_bias) { - bias_d = phi::Empty( - dev_ctx_, {batch_size_, num_heads_, max_seqlen_q_, max_seqlen_k_}); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + q_grad_ptr = qkv_transpose_out_grad.data(); + k_grad_ptr = q_grad_ptr + q_size; + v_grad_ptr = k_grad_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); q_ptr = q_transpose_out->data(); k_ptr = k_transpose_out->data(); v_ptr = v_transpose_out->data(); + q_transpose_out_grad.Resize(config->q_transpose_out_dims); k_transpose_out_grad.Resize(config->kv_transpose_out_dims); v_transpose_out_grad.Resize(config->kv_transpose_out_dims); @@ -703,273 +623,99 @@ class FMHAGateRef { k_transpose_out_grad.numel() * sizeof(T)); v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, v_transpose_out_grad.numel() * sizeof(T)); + } - // 6. construct random seed - auto gen = dev_ctx_.GetGenerator(); - uint64_t inc = batch_size_ * num_heads_ * 32; - auto seed_offset_pair = gen->IncrementOffset(inc); - uint64_t seed = seed_offset_pair.first; - uint64_t offset = seed_offset_pair.second; - - // 7. flas_attn part one, get temp worksapce size. - uint64_t workspace_size; - float p_dropout = 0.f; - float softmax_scale = static_cast(1); - cudaStream_t stream = dev_ctx_.stream(); - int num_splits = 0; // 0 for an internal heuristic, which is optimal - bool succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( - static_cast(q_ptr), - static_cast(k_ptr), - static_cast(v_ptr), - static_cast(q_grad_ptr), - static_cast(k_grad_ptr), - static_cast(v_grad_ptr), - static_cast(fmha_out->data()), - static_cast(fmha_out_grad->data()), - cu_seq_q.data(), - cu_seq_k.data(), - total_q_, - total_k_, - batch_size_, - num_heads_, - head_size_, - max_seqlen_q_, - max_seqlen_k_, - p_dropout, - softmax_scale, - /*zero_tensors=*/false, - /*is_causal=*/false, - is_bf16, - num_splits, - softmax_lse->data(), - softmax_d.data(), - bias_d.data(), - nullptr, - &workspace_size, - stream, - seed, - offset, - src_mask ? temp_mask.data() : nullptr, - nonbatched_bias ? 
temp_bias.data() : nullptr, - temp_mask.dims().Get(), - temp_bias.dims().Get()); - if (!succ) { - PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - phi::DenseTensor workspace; - printf("workspace_size = %d\n", workspace_size); - if (workspace_size > 0) { - workspace = phi::Empty( - dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( - static_cast(q_ptr), - static_cast(k_ptr), - static_cast(v_ptr), - static_cast(q_grad_ptr), - static_cast(k_grad_ptr), - static_cast(v_grad_ptr), - static_cast(fmha_out->data()), - static_cast(fmha_out_grad->data()), - cu_seq_q.data(), - cu_seq_k.data(), - total_q_, - total_k_, - batch_size_, - num_heads_, - head_size_, - max_seqlen_q_, - max_seqlen_k_, - p_dropout, - softmax_scale, - /*zero_tensors=*/false, - /*is_causal=*/false, - is_bf16, - num_splits, - softmax_lse->data(), - softmax_d.data(), - bias_d.data(), - workspace.data(), - &workspace_size, - stream, - seed, - offset, - src_mask ? temp_mask.data() : nullptr, - nonbatched_bias ? temp_bias.data() : nullptr, - temp_mask.dims().Get(), - temp_bias.dims().Get()); - if (!succ) { - PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - if (nonbatched_bias) { - // compare block reduce - // auto size = attn_bias->sizes(); - // dbias = ds.reshape({ -1, size[0], size[1], size[2], size[3] }).sum({ - // 0 }); result.push_back( dbias ); - const auto temp_bias_num = temp_bias.numel(); - const auto bias_d_num = bias_d.numel(); - auto dbias_first_dim = bias_d_num / temp_bias_num; - bias_d.Resize({dbias_first_dim, - temp_bias.dims()[0], - temp_bias.dims()[1], - temp_bias.dims()[2], - temp_bias.dims()[3]}); - phi::funcs:: - ReduceKernel>( - dev_ctx_, - bias_d, - nonbatched_bias_grad, - kps::IdentityFunctor(), - {0}); - } - } else { - if (merge_qkv_) { - PADDLE_ENFORCE_NOT_NULL( - qkv_transpose_out, - platform::errors::NotFound("The input qkv_transpose_out can not be " - "nullptr when merge_qkv is true.")); - - int64_t q_size = config->GetQuerySize(); - q_ptr = qkv_transpose_out->data(); - k_ptr = q_ptr + q_size; - v_ptr = k_ptr + q_size; - - qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); - AllocWithDebugInfo( - dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); - - q_grad_ptr = qkv_transpose_out_grad.data(); - k_grad_ptr = q_grad_ptr + q_size; - v_grad_ptr = k_grad_ptr + q_size; - } else { - PADDLE_ENFORCE_NOT_NULL( - q_transpose_out, - platform::errors::NotFound("The input q_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - k_transpose_out, - platform::errors::NotFound("The input k_transpose_out can not be " - "nullptr when merge_qkv is false.")); - PADDLE_ENFORCE_NOT_NULL( - v_transpose_out, - platform::errors::NotFound("The input v_transpose_out can not be " - "nullptr when merge_qkv is false.")); - - q_ptr = q_transpose_out->data(); - k_ptr = k_transpose_out->data(); - v_ptr = v_transpose_out->data(); - - q_transpose_out_grad.Resize(config->q_transpose_out_dims); - k_transpose_out_grad.Resize(config->kv_transpose_out_dims); - v_transpose_out_grad.Resize(config->kv_transpose_out_dims); - - q_grad_ptr = dev_ctx_.Alloc( - &q_transpose_out_grad, q_transpose_out_grad.numel() * sizeof(T)); - k_grad_ptr = dev_ctx_.Alloc( - &k_transpose_out_grad, 
k_transpose_out_grad.numel() * sizeof(T)); - v_grad_ptr = dev_ctx_.Alloc( - &v_transpose_out_grad, v_transpose_out_grad.numel() * sizeof(T)); - } - - phi::DenseTensor softmax_out_grad; - softmax_out_grad.Resize(config->softmax_out_dims); - AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); - - int64_t gemm_batch_size = - config->batch_size * config->seq_len_m * config->num_heads; - { - // Forward: fmha_out = transpose(qktv_out) - phi::DenseTensor qktv_out_grad; - qktv_out_grad.Resize(config->qktv_out_dims); - AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); - ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); - - // Forward: qktv_out = BatchedGEMM(softmax_out, V) - // Backward: - // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) - int64_t gemm_m = config->m_size; - int64_t gemm_n = config->head_dim; - int64_t gemm_k = config->seq_len_r; - - const T* softmax_out_ptr = softmax_out->data(); - const T* qktv_out_grad_ptr = qktv_out_grad.data(); - ComputeBatchedGEMM(softmax_out_ptr, - qktv_out_grad_ptr, - v_grad_ptr, - true, - false, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size); - - // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) - gemm_m = config->seq_len_r; - gemm_n = config->m_size; - gemm_k = config->head_dim; - - T* softmax_out_grad_ptr = softmax_out_grad.data(); - ComputeBatchedGEMM(qktv_out_grad_ptr, - v_ptr, - softmax_out_grad_ptr, - false, - true, - gemm_m, - gemm_n, - gemm_k, - gemm_batch_size); - } - - phi::DenseTensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); - ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, - softmax_out, - src_mask_grad, - qk_out_grad, - nonbatched_bias_grad); + phi::DenseTensor softmax_out_grad; + softmax_out_grad.Resize(config->softmax_out_dims); + AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); - // Forward: qk_out = BatchedGEMM(Q, K^T) - // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + { + // Forward: fmha_out = transpose(qktv_out) + phi::DenseTensor qktv_out_grad; + qktv_out_grad.Resize(config->qktv_out_dims); + AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); + ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); + + // Forward: qktv_out = BatchedGEMM(softmax_out, V) + // Backward: + // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) int64_t gemm_m = config->m_size; int64_t gemm_n = config->head_dim; int64_t gemm_k = config->seq_len_r; - T alpha = static_cast(1.0 / sqrt(config->head_dim)); - T* qk_out_grad_ptr = qk_out_grad->data(); - ComputeBatchedGEMM(qk_out_grad_ptr, - q_ptr, - k_grad_ptr, + const T* softmax_out_ptr = softmax_out->data(); + const T* qktv_out_grad_ptr = qktv_out_grad.data(); + ComputeBatchedGEMM(softmax_out_ptr, + qktv_out_grad_ptr, + v_grad_ptr, true, false, gemm_m, gemm_n, gemm_k, - gemm_batch_size, - alpha); + gemm_batch_size); - // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) + // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) gemm_m = config->seq_len_r; - gemm_n = config->head_dim; - gemm_k = config->m_size; - ComputeBatchedGEMM(qk_out_grad_ptr, - k_ptr, - q_grad_ptr, - false, + gemm_n = config->m_size; + gemm_k = config->head_dim; + + T* softmax_out_grad_ptr = softmax_out_grad.data(); + ComputeBatchedGEMM(qktv_out_grad_ptr, + v_ptr, + softmax_out_grad_ptr, false, + true, gemm_m, gemm_n, gemm_k, - gemm_batch_size, - 
alpha); + gemm_batch_size); } - if (merge_qkv_ || config->use_flash_attn) { + phi::DenseTensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); + ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, + softmax_out, + src_mask_grad, + qk_out_grad, + nonbatched_bias_grad); + + // Forward: qk_out = BatchedGEMM(Q, K^T) + // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->head_dim; + int64_t gemm_k = config->seq_len_r; + T alpha = static_cast(1.0 / sqrt(config->head_dim)); + + T* qk_out_grad_ptr = qk_out_grad->data(); + ComputeBatchedGEMM(qk_out_grad_ptr, + q_ptr, + k_grad_ptr, + true, + false, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size, + alpha); + + // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) + gemm_m = config->seq_len_r; + gemm_n = config->head_dim; + gemm_k = config->m_size; + ComputeBatchedGEMM(qk_out_grad_ptr, + k_ptr, + q_grad_ptr, + false, + false, + gemm_m, + gemm_n, + gemm_k, + gemm_batch_size, + alpha); + + if (merge_qkv_) { phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); } else { @@ -1433,6 +1179,290 @@ class FlashAttnWithGating { } } + void ComputeBackward(const phi::DenseTensor* q_transpose_out, + const phi::DenseTensor* k_transpose_out, + const phi::DenseTensor* v_transpose_out, + const phi::DenseTensor* qkv_transpose_out, + const phi::DenseTensor* fmha_out_grad, + phi::DenseTensor* src_mask_grad, + phi::DenseTensor* nonbatched_bias_grad, + GateAttentionGradConfig* config, + const phi::DenseTensor* fmha_out = nullptr, + const phi::DenseTensor* softmax_lse = nullptr, + const phi::DenseTensor* nonbatched_bias = nullptr, + const phi::DenseTensor* src_mask = nullptr) { + T* q_grad_ptr = nullptr; + T* k_grad_ptr = nullptr; + T* v_grad_ptr = nullptr; + + phi::DenseTensor q_transpose_out_grad; + phi::DenseTensor k_transpose_out_grad; + phi::DenseTensor v_transpose_out_grad; + phi::DenseTensor qkv_transpose_out_grad; + + bool is_bf16 = + qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; + + if (std::is_same::value) { + std::cout << "[Grad]: T is phi::dtype::float16. \n"; + } else if (std::is_same::value) { + std::cout << "[Grad]: T is phi::dtype::bfloat16. \n"; + } else if (std::is_same::value) { + std::cout << "[Grad]: T is float. \n"; + } + + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be" + "nullptr when merge_qkv is true.")); + int64_t q_size = config->GetQuerySize(); + const T* q_ptr = qkv_transpose_out->data(); + const T* k_ptr = q_ptr + q_size; + const T* v_ptr = k_ptr + q_size; + + qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + AllocWithDebugInfo( + dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); + + int seq_batch_size = static_cast(config->batch_size) * + static_cast(config->seq_len_m); + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. 
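+    // cu_seq_q and cu_seq_k are the cumulative sequence-length arrays the
+    // varlen flash-attention interface expects: entry i is i * seq_len_r, so
+    // for example seq_batch_size = 2 with seq_len_r = 3 fills both tensors
+    // with {0, 3, 6} (queries and keys share the same lengths here).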
+ phi::DenseTensor cu_seq_q, cu_seq_k; + int64_t start = 0; + int64_t step = static_cast(config->seq_len_r); + int64_t end_size = (seq_batch_size + 1); + int64_t end = end_size; + int64_t seq_size = 0; + phi::funcs::GetSize(start, end, step, &seq_size); + cu_seq_q.Resize({end_size}); + cu_seq_k.Resize({end_size}); + AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_q", &cu_seq_q); + AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_k", &cu_seq_k); + int64_t block = std::min(seq_size, static_cast(256)); + int64_t grid = (seq_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, cu_seq_q.data(), cu_seq_k.data()); + VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start + << ", step = " << step << ", end = " << end; + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + // 3. Dealing with mask and bias for flash_attn. + phi::DenseTensor temp_mask, temp_bias; + auto dims_merge_func = [&](const phi::DenseTensor* src_tensor, + phi::DenseTensor* dst_tensor, + const std::string& prefix) { + if (src_tensor) { + int64_t first_dim = 1; + dst_tensor->ShareDataWith(*src_tensor); + auto dims_ = src_tensor->dims(); + for (int i = 0; i < dims_.size() - 3; ++i) { + first_dim *= dims_[i]; + } + auto dims_rank = dims_.size(); + dst_tensor->Resize({first_dim, + dims_[dims_rank - 3], + dims_[dims_rank - 2], + dims_[dims_rank - 1]}); + GetFlashAttnDimsString(prefix, temp_mask.dims()); + } + }; + dims_merge_func(src_mask, &temp_mask, "[Grad] mask_dim"); + dims_merge_func(nonbatched_bias, &temp_bias, "[Grad] bias_dim"); + + phi::DDim qkv_dims({3, + seq_batch_size * static_cast(config->seq_len_r), + static_cast(config->num_heads), + static_cast(config->head_dim)}); + int batch_size_ = seq_batch_size; + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_ = batch_size_; + int max_seqlen_k_ = batch_size_; + VLOG(6) << "[Flash_attn Grad] batch_size : " << batch_size_; + VLOG(6) << "[Flash_attn Grad] total_q : " << total_q_; + VLOG(6) << "[Flash_attn Grad] total_k : " << total_k_; + VLOG(6) << "[Flash_attn Grad] num_heads : " << num_heads_; + VLOG(6) << "[Flash_attn Grad] head_size : " << head_size_; + VLOG(6) << "[Flash_attn Grad] max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "[Flash_attn Grad] max_seqlen_k : " << max_seqlen_k_; + + // 5. 
construct softmax_lse + int last_q_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; + // softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); + // AllocWithDebugInfo( + // dev_ctx_, "flash_attn: softmax_lse", softmax_lse); + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + phi::DenseTensor softmax_d = phi::Empty( + dev_ctx_, {batch_size_, num_heads_, last_q_dim}); + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + phi::DenseTensor bias_d; + if (nonbatched_bias) { + bias_d = phi::Empty( + dev_ctx_, {batch_size_, num_heads_, max_seqlen_q_, max_seqlen_k_}); + } + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + q_transpose_out_grad.Resize(config->q_transpose_out_dims); + k_transpose_out_grad.Resize(config->kv_transpose_out_dims); + v_transpose_out_grad.Resize(config->kv_transpose_out_dims); + + q_grad_ptr = dev_ctx_.Alloc(&q_transpose_out_grad, + q_transpose_out_grad.numel() * sizeof(T)); + k_grad_ptr = dev_ctx_.Alloc(&k_transpose_out_grad, + k_transpose_out_grad.numel() * sizeof(T)); + v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, + v_transpose_out_grad.numel() * sizeof(T)); + + // 6. construct random seed + auto gen = dev_ctx_.GetGenerator(); + uint64_t inc = batch_size_ * num_heads_ * 32; + auto seed_offset_pair = gen->IncrementOffset(inc); + uint64_t seed = seed_offset_pair.first; + uint64_t offset = seed_offset_pair.second; + + // 7. flas_attn part one, get temp worksapce size. + uint64_t workspace_size; + float p_dropout = 0.f; + float softmax_scale = static_cast(1); + cudaStream_t stream = dev_ctx_.stream(); + int num_splits = 0; // 0 for an internal heuristic, which is optimal + bool succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + static_cast(q_grad_ptr), + static_cast(k_grad_ptr), + static_cast(v_grad_ptr), + static_cast(fmha_out->data()), + static_cast(fmha_out_grad->data()), + cu_seq_q.data(), + cu_seq_k.data(), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + p_dropout, + softmax_scale, + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16, + num_splits, + softmax_lse->data(), + softmax_d.data(), + bias_d.data(), + nullptr, + &workspace_size, + stream, + seed, + offset, + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? temp_bias.data() : nullptr, + src_mask ? temp_mask.dims().Get() : nullptr, + nonbatched_bias ? 
temp_bias.dims().Get() : nullptr); + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + phi::DenseTensor workspace; + printf("workspace_size = %d\n", workspace_size); + if (workspace_size > 0) { + workspace = phi::Empty( + dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + DBGPTR(workspace.data(), "workspace"); + } + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( + static_cast(q_ptr), + static_cast(k_ptr), + static_cast(v_ptr), + static_cast(q_grad_ptr), + static_cast(k_grad_ptr), + static_cast(v_grad_ptr), + static_cast(fmha_out->data()), + static_cast(fmha_out_grad->data()), + cu_seq_q.data(), + cu_seq_k.data(), + total_q_, + total_k_, + batch_size_, + num_heads_, + head_size_, + max_seqlen_q_, + max_seqlen_k_, + p_dropout, + softmax_scale, + /*zero_tensors=*/false, + /*is_causal=*/false, + is_bf16, + num_splits, + softmax_lse->data(), + softmax_d.data(), + bias_d.data(), + workspace.data(), + &workspace_size, + stream, + seed, + offset, + src_mask ? temp_mask.data() : nullptr, + nonbatched_bias ? temp_bias.data() : nullptr, + src_mask ? temp_mask.dims().Get() : nullptr, + nonbatched_bias ? temp_bias.dims().Get() : nullptr); + if (!succ) { + PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); + } + LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + + if (nonbatched_bias) { + // compare block reduce + // auto size = attn_bias->sizes(); + // dbias = ds.reshape({ -1, size[0], size[1], size[2], size[3] }).sum({ + // 0 }); result.push_back( dbias ); + const auto temp_bias_num = temp_bias.numel(); + const auto bias_d_num = bias_d.numel(); + auto dbias_first_dim = bias_d_num / temp_bias_num; + bias_d.Resize({dbias_first_dim, + temp_bias.dims()[0], + temp_bias.dims()[1], + temp_bias.dims()[2], + temp_bias.dims()[3]}); + phi::funcs::ReduceKernel>( + dev_ctx_, + bias_d, + nonbatched_bias_grad, + kps::IdentityFunctor(), + {0}); + } + + if (merge_qkv_) { + phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); + ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); + } else { + phi::DenseTensor* q_out_grad = config->GetQueryOutGrad(); + phi::DenseTensor* k_out_grad = config->GetKeyOutGrad(); + phi::DenseTensor* v_out_grad = config->GetValueOutGrad(); + ComputeQKVTransposeBackward(q_transpose_out_grad, + k_transpose_out_grad, + v_transpose_out_grad, + q_out_grad, + k_out_grad, + v_out_grad); + } + } + private: // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] @@ -1443,6 +1473,29 @@ class FlashAttnWithGating { dev_ctx_, qkv_out, perm, qkv_transpose_out); } + void ComputeQKVTransposeBackward(const phi::DenseTensor& q_transpose_out_grad, + const phi::DenseTensor& k_transpose_out_grad, + const phi::DenseTensor& v_transpose_out_grad, + phi::DenseTensor* q_out_grad, + phi::DenseTensor* k_out_grad, + phi::DenseTensor* v_out_grad) { + std::vector perm = {0, 1, 3, 2, 4}; + phi::funcs::TransposeGPUKernelDriver( + dev_ctx_, q_transpose_out_grad, perm, q_out_grad); + phi::funcs::TransposeGPUKernelDriver( + dev_ctx_, k_transpose_out_grad, perm, k_out_grad); + phi::funcs::TransposeGPUKernelDriver( + dev_ctx_, v_transpose_out_grad, perm, v_out_grad); + } + + void ComputeQKVTransposeBackward( + const phi::DenseTensor& qkv_transpose_out_grad, + phi::DenseTensor* qkv_out_grad) { + std::vector perm = {1, 2, 4, 0, 3, 5}; + phi::funcs::TransposeGPUKernelDriver( + dev_ctx_, 
qkv_transpose_out_grad, perm, qkv_out_grad); + } + const phi::GPUContext& dev_ctx_; bool merge_qkv_; }; diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index a3fd178059fab..9743bc33fd055 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -231,15 +231,15 @@ class FusedGateAttentionGradOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", - "fused_aate_attention_arad"); + "fused_gate_attention_arad"); OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", - "fused_aate_attention_arad"); + "fused_gate_attention_arad"); OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", - "fused_aate_attention_arad"); + "fused_gate_attention_arad"); for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { ctx->SetOutputDim(framework::GradVarName(name), ctx->GetInputDim(name)); @@ -267,6 +267,27 @@ class FusedGateAttentionGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), ctx->GetInputDim("OutLinearBias")); } + + protected: + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Query"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } + + phi::KernelKey GetKernelTypeForVar( + const std::string& var_name, + const phi::DenseTensor& tensor, + const phi::KernelKey& expected_kernel_type) const override { + if (var_name == "SoftmaxLse") { + return phi::KernelKey(phi::Backend::ALL_BACKEND, + expected_kernel_type.layout(), + expected_kernel_type.dtype()); + } + return phi::KernelKey( + tensor.place(), tensor.layout(), expected_kernel_type.dtype()); + } }; template diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 3d10047bf23a0..a0f97ce59109b 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -474,6 +474,8 @@ template class FusedGateAttentionGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + LOG(INFO) << "In FusedGateAttentionGradKernel"; + // forward input const auto *query = ctx.Input("Query"); const auto *key = ctx.Input("Key"); @@ -489,7 +491,6 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { ctx.Input("ValueTransposeOut"); const auto *qkv_transpose_out = ctx.Input("QKVTransposeOut"); - const auto *softmax_out = ctx.Input("SoftmaxOut"); const auto *fmha_out = ctx.Input("FMHAOut"); const auto *gate_out = ctx.Input("GateOut"); @@ -516,17 +517,6 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { has_gating, use_flash_attn); - const phi::DenseTensor *fwd_out = nullptr; - const phi::DenseTensor *fwd_bias = nullptr; - const phi::DenseTensor *fwd_mask = nullptr; - const phi::DenseTensor *fwd_softmax_lse = nullptr; - if (merge_qkv && use_flash_attn) { - fwd_bias = ctx.Input("NonbatchedBias"); - fwd_mask = ctx.Input("SrcMask"); - fwd_softmax_lse = ctx.Input("SoftmaxLse"); - fwd_out = fmha_out; - } - phi::DenseTensor fmha_out_grad; fmha_out_grad.Resize(config.gate_out_dims); AllocWithDebugInfo(dev_ctx, "fmha_out_grad", &fmha_out_grad); @@ -560,28 +550,29 @@ class FusedGateAttentionGradKernel : 
public framework::OpKernel { dev_ctx, "nonbatched_bias_grad", nonbatched_bias_grad); } - auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); - - if (use_flash_attn) { - const phi::DenseTensor *fwd_bias = + if (config.CanUseFlashAttn()) { + const auto *non_batched_bias = ctx.Input("NonbatchedBias"); - const phi::DenseTensor *fwd_mask = ctx.Input("SrcMask"); - const phi::DenseTensor *fwd_softmax_lse = - ctx.Input("SoftmaxLse"); + const auto *src_mask = ctx.Input("SrcMask"); + const auto *softmax_lse = ctx.Input("SoftmaxLse"); + + auto fmha_compute = FlashAttnWithGating(dev_ctx, merge_qkv); fmha_compute.ComputeBackward(q_transpose_out, k_transpose_out, v_transpose_out, qkv_transpose_out, - softmax_out, &fmha_out_grad, nullptr, nonbatched_bias_grad, &config, fmha_out, - fwd_softmax_lse, - fwd_bias, - fwd_mask); + softmax_lse, + non_batched_bias, + src_mask); } else { + const auto *softmax_out = ctx.Input("SoftmaxOut"); + + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); fmha_compute.ComputeBackward(q_transpose_out, k_transpose_out, v_transpose_out, From 37a095390bf3fea5c1190d9fd9a2bf66ff420161 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Tue, 25 Apr 2023 09:12:17 +0800 Subject: [PATCH 039/405] Remove managed memory msg in cuda allocator (#53263) --- paddle/fluid/memory/allocation/cuda_allocator.cc | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 4ea182cb722ce..781addd7dba60 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -70,29 +70,20 @@ phi::Allocation* CUDAAllocator::AllocateImpl(size_t size) { limit_size); } - std::string managed_memory_msg; - if (platform::IsGPUManagedMemoryOversubscriptionSupported(place_.device)) { - managed_memory_msg = string::Sprintf( - "If the above ways do not solve the out of memory problem, you can try " - "to use CUDA managed memory. The command is `export " - "FLAGS_use_cuda_managed_memory=true`."); - } - PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" - "2. If no, please decrease the batch size of your model. %s\n%s\n", + "2. If no, please decrease the batch size of your model. %s\n", place_.device, string::HumanReadableSize(size), place_.device, string::HumanReadableSize(allocated), string::HumanReadableSize(avail), place_.device, - err_msg, - managed_memory_msg)); + err_msg)); } } // namespace allocation From ade7a070c5b7012d1e6a7a6d810cf1fe13984f38 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 25 Apr 2023 09:33:38 +0800 Subject: [PATCH 040/405] Simplify codes. --- .../operators/fused/fused_gate_attention.h | 155 +++++++----------- 1 file changed, 61 insertions(+), 94 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 07b4b5973cac8..8e91eca7b2e8a 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -998,23 +998,8 @@ class FlashAttnWithGating { // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. 
phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t end_size = seq_batch_size + 1; - int64_t seq_size = 0; - int64_t start = 0; - int64_t end = end_size; - int64_t step = static_cast(config->seq_len_r); - phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({end_size}); - cu_seq_k.Resize({end_size}); - AllocWithDebugInfo(dev_ctx_, "cu_seq_q", &cu_seq_q); - AllocWithDebugInfo(dev_ctx_, "cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (seq_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, cu_seq_q.data(), cu_seq_k.data()); - VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start - << ", step = " << step << ", end = " << end; - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "2: Init cu_seq_q and cu_seq_k"; + int64_t step = static_cast(config->seq_len_r); + AllocAndInitSeqQK(seq_batch_size, step, &cu_seq_q, &cu_seq_k); // 3. Dealing with mask and bias for flash_attn. phi::DenseTensor temp_mask, temp_bias; @@ -1071,9 +1056,7 @@ class FlashAttnWithGating { << "]"; // 6. construct random seed - auto gen = dev_ctx_.GetGenerator(); - uint64_t inc = batch_size_ * num_heads_ * 32; - auto seed_offset_pair = gen->IncrementOffset(inc); + auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; @@ -1124,13 +1107,7 @@ class FlashAttnWithGating { LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "7: Get workspace_size=" << workspace_size; - phi::DenseTensor workspace; - if (workspace_size > 0) { - workspace = phi::Empty( - dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "Allocate workspace"; + phi::DenseTensor workspace = CreateWorkspace(workspace_size); LOG(INFO) << "qkv_transpose_out: " << TensorDebugString(qkv_transpose_out); LOG(INFO) << "src_mask: " << TensorDebugString(src_mask); @@ -1191,15 +1168,6 @@ class FlashAttnWithGating { const phi::DenseTensor* softmax_lse = nullptr, const phi::DenseTensor* nonbatched_bias = nullptr, const phi::DenseTensor* src_mask = nullptr) { - T* q_grad_ptr = nullptr; - T* k_grad_ptr = nullptr; - T* v_grad_ptr = nullptr; - - phi::DenseTensor q_transpose_out_grad; - phi::DenseTensor k_transpose_out_grad; - phi::DenseTensor v_transpose_out_grad; - phi::DenseTensor qkv_transpose_out_grad; - bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; @@ -1215,38 +1183,29 @@ class FlashAttnWithGating { qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be" "nullptr when merge_qkv is true.")); + int64_t q_size = config->GetQuerySize(); const T* q_ptr = qkv_transpose_out->data(); const T* k_ptr = q_ptr + q_size; const T* v_ptr = k_ptr + q_size; + phi::DenseTensor qkv_transpose_out_grad; qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); AllocWithDebugInfo( dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); + T* q_grad_ptr = qkv_transpose_out_grad.data(); + T* k_grad_ptr = q_grad_ptr + q_size; + T* v_grad_ptr = k_grad_ptr + q_size; + int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. + // 2. Init with cu_seq_q and cu_seq_k for flash_attn. 
phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t start = 0; - int64_t step = static_cast(config->seq_len_r); - int64_t end_size = (seq_batch_size + 1); - int64_t end = end_size; - int64_t seq_size = 0; - phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({end_size}); - cu_seq_k.Resize({end_size}); - AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_q", &cu_seq_q); - AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (seq_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, cu_seq_q.data(), cu_seq_k.data()); - VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start - << ", step = " << step << ", end = " << end; - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + int64_t step = static_cast(config->seq_len_r); + AllocAndInitSeqQK(seq_batch_size, step, &cu_seq_q, &cu_seq_k); // 3. Dealing with mask and bias for flash_attn. phi::DenseTensor temp_mask, temp_bias; @@ -1308,24 +1267,8 @@ class FlashAttnWithGating { } LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - q_ptr = q_transpose_out->data(); - k_ptr = k_transpose_out->data(); - v_ptr = v_transpose_out->data(); - q_transpose_out_grad.Resize(config->q_transpose_out_dims); - k_transpose_out_grad.Resize(config->kv_transpose_out_dims); - v_transpose_out_grad.Resize(config->kv_transpose_out_dims); - - q_grad_ptr = dev_ctx_.Alloc(&q_transpose_out_grad, - q_transpose_out_grad.numel() * sizeof(T)); - k_grad_ptr = dev_ctx_.Alloc(&k_transpose_out_grad, - k_transpose_out_grad.numel() * sizeof(T)); - v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, - v_transpose_out_grad.numel() * sizeof(T)); - // 6. construct random seed - auto gen = dev_ctx_.GetGenerator(); - uint64_t inc = batch_size_ * num_heads_ * 32; - auto seed_offset_pair = gen->IncrementOffset(inc); + auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; @@ -1376,15 +1319,7 @@ class FlashAttnWithGating { } LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - phi::DenseTensor workspace; - printf("workspace_size = %d\n", workspace_size); - if (workspace_size > 0) { - workspace = phi::Empty( - dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - + phi::DenseTensor workspace = CreateWorkspace(workspace_size); succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), @@ -1447,23 +1382,55 @@ class FlashAttnWithGating { {0}); } - if (merge_qkv_) { - phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); - ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); - } else { - phi::DenseTensor* q_out_grad = config->GetQueryOutGrad(); - phi::DenseTensor* k_out_grad = config->GetKeyOutGrad(); - phi::DenseTensor* v_out_grad = config->GetValueOutGrad(); - ComputeQKVTransposeBackward(q_transpose_out_grad, - k_transpose_out_grad, - v_transpose_out_grad, - q_out_grad, - k_out_grad, - v_out_grad); - } + phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); + ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); } private: + void AllocAndInitSeqQK(int64_t seq_batch_size, + int64_t step, + phi::DenseTensor* cu_seq_q, + phi::DenseTensor* cu_seq_k) { + int64_t start = 0; + int64_t end_size = seq_batch_size + 1; + int64_t end = end_size; + int64_t seq_size = 0; + phi::funcs::GetSize(start, end, step, &seq_size); + + cu_seq_q->Resize({end_size}); + 
cu_seq_k->Resize({end_size}); + AllocWithDebugInfo(dev_ctx_, "cu_seq_q", cu_seq_q); + AllocWithDebugInfo(dev_ctx_, "cu_seq_k", cu_seq_k); + + int64_t block = std::min(seq_size, static_cast(256)); + int64_t grid = (seq_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, cu_seq_q->data(), cu_seq_k->data()); + + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "AllocAndInit cu_seq_q and cu_seq_k: start=" << start + << ", step=" << step << ", end=" << end; + } + + phi::DenseTensor CreateWorkspace(uint64_t workspace_size) { + phi::DenseTensor workspace; + if (workspace_size > 0) { + workspace = phi::Empty( + dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + DBGPTR(workspace.data(), "workspace"); + } + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "Allocate workspace: workspace_size=" << workspace_size; + return workspace; + } + + std::pair GenerateSeedOffsetPair(int64_t batch_size, + int64_t num_heads) { + auto gen = dev_ctx_.GetGenerator(); + uint64_t inc = batch_size * num_heads * 32; + return gen->IncrementOffset(inc); + } + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] void ComputeQKVTransposeForwardForFlashAttn( From b7565222beee8bfaec05048c05c5b17a71de289a Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Tue, 25 Apr 2023 11:04:05 +0800 Subject: [PATCH 041/405] add syncthreads (#53149) --- paddle/phi/kernels/funcs/dropout_impl.cu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 0b47febb0d30e..006784eb64c76 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -272,6 +272,7 @@ __global__ void DropOutNdForwardKernel( idx = fast_divmoder.val[0]; offset += broadcast_config.strides[j] * fast_divmoder.val[1]; } + __syncthreads(); y[i] = dst_functor(src[i], mask[offset]); } } From a1f8f411c24ccb8c7f345d102ac8c777789a8624 Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 25 Apr 2023 11:22:27 +0800 Subject: [PATCH 042/405] change gpu_time_diff (#53252) --- tools/check_op_benchmark_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py index 4396734a85491..30d417b2fb771 100644 --- a/tools/check_op_benchmark_result.py +++ b/tools/check_op_benchmark_result.py @@ -74,7 +74,7 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result): gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time gpu_time_diff_str = "{:.5f}".format(gpu_time_diff * 100) else: - gpu_time_diff = None + gpu_time_diff = 0 gpu_time_diff_str = "" pr_total_time = pr_data.get("total") From 5ca3bc6d981ce29b625c5127d87f5a22d2d6352e Mon Sep 17 00:00:00 2001 From: Meteor Liu Date: Tue, 25 Apr 2023 11:25:13 +0800 Subject: [PATCH 043/405] rename monkey_patch_{math_}varbase as monkey_patch_{math_}tensor (#53191) * rename monkey_patch_varbase as monkey_patch_tensor & monkey_patch_math_varbase as monkey_patch_math_tensor * rename monkey_patch_varbase as monkey_patch_tensor & monkey_patch_math_varbase as monkey_patch_math_tensor * rename monkey_patch_varbase as monkey_patch_tensor & monkey_patch_math_varbase as monkey_patch_math_tensor v2 * rename monkey_patch_varbase as monkey_patch_tensor & monkey_patch_math_varbase as monkey_patch_math_tensor fixed bug --- python/paddle/__init__.py | 8 +++-- python/paddle/fluid/__init__.py | 4 +-- 
python/paddle/fluid/dygraph/__init__.py | 2 -- python/paddle/fluid/dygraph/math_op_patch.py | 8 ++--- ...tch_methods.py => tensor_patch_methods.py} | 22 ++++++------- python/paddle/fluid/framework.py | 31 ------------------- .../unittests/test_deprecated_decorator.py | 2 +- python/paddle/framework/__init__.py | 7 +++-- 8 files changed, 29 insertions(+), 55 deletions(-) rename python/paddle/fluid/dygraph/{varbase_patch_methods.py => tensor_patch_methods.py} (98%) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ca237df8e53fe..f319ab27c063a 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -24,11 +24,15 @@ ) from .batch import batch # noqa: F401 + +# Do the *DUPLICATED* monkey-patch for the tensor object. +# We need remove the duplicated code here once we fix +# the illogical implement in the monkey-patch methods later. from .framework import monkey_patch_variable -from .framework import monkey_patch_math_varbase +from .framework import monkey_patch_math_tensor monkey_patch_variable() -monkey_patch_math_varbase() +monkey_patch_math_tensor() from .framework import disable_signal_handler # noqa: F401 from .framework import get_flags # noqa: F401 diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c33406dba8e6a..a82febe3ad6d7 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -79,7 +79,7 @@ from .compiler import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable from .dygraph.base import enable_dygraph, disable_dygraph -from .dygraph.varbase_patch_methods import monkey_patch_varbase +from .dygraph.tensor_patch_methods import monkey_patch_tensor from .core import _cuda_synchronize from .trainer_desc import ( TrainerDesc, @@ -211,7 +211,7 @@ def remove_flag_if_exists(name): # Consider paddle.init(args) or paddle.main(args) monkey_patch_variable() __bootstrap__() -monkey_patch_varbase() +monkey_patch_tensor() # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually. atexit.register(core.clear_executor_cache) diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 36dbf90c52b8b..c40262a45d7c3 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -21,8 +21,6 @@ from . import learning_rate_scheduler from .learning_rate_scheduler import * -from .math_op_patch import monkey_patch_math_varbase - __all__ = [] __all__ += base.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 220d849070d18..7b20e626b8e95 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -65,7 +65,7 @@ _already_patch_eager_tensor = False -def monkey_patch_math_varbase(): +def monkey_patch_math_tensor(): """ Similar to monkey_patch_variable. The difference is, in dygraph mode, use auto-generated op functions for better performance. @@ -248,7 +248,7 @@ def __impl__(self, other_var): # do nothing pass - # 2. create varbase for scalar + # 2. 
create Tensor for scalar lhs_dtype = self.dtype other_var_should_be = core.eager.Tensor if not isinstance(other_var, other_var_should_be): @@ -343,7 +343,7 @@ def __impl__(self, other_var): __impl__.__name__ = method_name return __impl__ - varbase_methods = [ + tensor_methods = [ ('__neg__', _neg_), ('__float__', _float_), ('__long__', _long_), @@ -498,7 +498,7 @@ def __impl__(self, other_var): setattr(local_tensor, method_name, method_impl) else: - for method in varbase_methods: + for method in tensor_methods: method_name = method[0] method_impl = method[1] setattr(local_tensor, method_name, method_impl) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/tensor_patch_methods.py similarity index 98% rename from python/paddle/fluid/dygraph/varbase_patch_methods.py rename to python/paddle/fluid/dygraph/tensor_patch_methods.py index 5e64cd6bad3cb..882a333b5ebf7 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/tensor_patch_methods.py @@ -32,7 +32,7 @@ in_dygraph_mode, ) from .base import switch_to_static_graph -from .math_op_patch import monkey_patch_math_varbase +from .math_op_patch import monkey_patch_math_tensor from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE import paddle.utils.deprecated as deprecated import paddle.profiler as profiler @@ -86,7 +86,7 @@ def remove(self): _already_patch_repr = False -def monkey_patch_varbase(): +def monkey_patch_tensor(): @switch_to_static_graph def _to_static_var(self, to_parameter=False, **kwargs): """ @@ -110,8 +110,8 @@ def _to_static_var(self, to_parameter=False, **kwargs): data = np.ones([3, 1024], dtype='float32') with fluid.dygraph.guard(): - var_base = to_variable(data) - static_var = var_base._to_static_var() + tensor = to_variable(data) + static_var = tensor._to_static_var() """ @@ -700,11 +700,11 @@ def __deepcopy__(self, memo): raise RuntimeError( "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) - new_varbase = core.eager.Tensor() - new_varbase.name = self.name + unique_name.generate("_deepcopy") - memo[id(self)] = new_varbase - new_varbase.copy_(self, True) - return new_varbase + new_tensor = core.eager.Tensor() + new_tensor.name = self.name + unique_name.generate("_deepcopy") + memo[id(self)] = new_tensor + new_tensor.copy_(self, True) + return new_tensor @property def block(self): @@ -1073,5 +1073,5 @@ def dtype_str(dtype): setattr(core.VarDesc.VarType, "__str__", dtype_str) _already_patch_repr = True - # patch math methods for varbase - monkey_patch_math_varbase() + # patch math methods for tensor + monkey_patch_math_tensor() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a699a2f0fe720..63ab3a65bb6b9 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -112,8 +112,6 @@ def __setattr__(self, name, val): _current_device = None global_prog_seed = 0 _current_pipeline_stage = None -_already_patch_eager_tensor = False -_already_patch_varbase = False _current_cuda_graph_mode = None _global_flags_ = core.globals() @@ -182,35 +180,6 @@ def __setattr__(self, name, val): # to make sure in most case, we find new dygraph mode first with only one if statement. -def _update_monkey_methods(): - """ - Update monkey methods of Tensor or eager.Tensor while - switching eager mode and legacy mode. 
- """ - from paddle import _C_ops, _legacy_C_ops - from .dygraph.varbase_patch_methods import monkey_patch_varbase - from .dygraph import monkey_patch_math_varbase - - global _already_patch_eager_tensor - global _already_patch_varbase - - if not _already_patch_eager_tensor: - monkey_patch_varbase() - monkey_patch_math_varbase() - - _already_patch_eager_tensor = True - - # switch Paddle.Tensor bind type - _switch_tensor_bind_type() - - -def _switch_tensor_bind_type(): - import paddle - - paddle.Tensor = core.eager.Tensor - paddle.Tensor.__qualname__ = 'Tensor' - - def _in_eager_without_dygraph_check(): return global_var._in_eager_mode_ diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py index 4624897f0168e..1e4bfe50515dd 100755 --- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -118,7 +118,7 @@ def test_tensor_gradient(self): with warnings.catch_warnings(record=True) as w: grad = x.gradient() assert ( - 'API "paddle.fluid.dygraph.varbase_patch_methods.gradient" is ' + 'API "paddle.fluid.dygraph.tensor_patch_methods.gradient" is ' 'deprecated since 2.1.0' ) in str(w[-1].message) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 2b9449b7c0902..3c6338e345ddb 100755 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -44,8 +44,11 @@ from .io_utils import _unpack_saved_dict from .io_utils import _load_program_scope -from ..fluid import monkey_patch_variable -from ..fluid.dygraph import monkey_patch_math_varbase +# Do the *DUPLICATED* monkey-patch for the tensor object. +# We need remove the duplicated code here once we fix +# the illogical implement in the monkey-patch methods later. 
+from ..fluid.layers.math_op_patch import monkey_patch_variable +from ..fluid.dygraph.math_op_patch import monkey_patch_math_tensor from ..fluid.framework import disable_signal_handler # noqa: F401 from ..fluid.framework import get_flags # noqa: F401 from ..fluid.framework import set_flags # noqa: F401 From 4695122492eee3cc9e9c585e33429c0f98dbdbb0 Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Tue, 25 Apr 2023 11:58:34 +0800 Subject: [PATCH 044/405] =?UTF-8?q?=E3=80=90Hackathon=20No57=E3=80=91add?= =?UTF-8?q?=20fp16=20&=20bf16=20for=20max=5Fpool2d=5Fwith=5Findex,=20max?= =?UTF-8?q?=5Fpool3d=5Fwith=5Findex=20(#52314)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add fp_bf for pool_max_withidx * fix some error * fix error * codestyle error * fix masktype * fix input bf type * input bf dtype convert error * back to convert input to bf16 first * fix convert error * fix bf16 grad check --- paddle/phi/kernels/funcs/pooling.cu | 24 +- paddle/phi/kernels/gpu/pool_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/pool_kernel.cu | 8 +- .../fluid/tests/unittests/test_pool_max_op.py | 205 +++++++++++++++++- 4 files changed, 227 insertions(+), 18 deletions(-) diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index d0a0416994169..2f89b51815e64 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1963,7 +1963,7 @@ __global__ void KernelMaxPool2dWithIdx(const int nthreads, wstart = max(wstart, 0); } - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2015,7 +2015,7 @@ __global__ void AdaptiveKernelMaxPool2dWithIdx(const int nthreads, wstart = AdaptStartIndex(w_offset, input_width, output_width); wend = AdaptEndIndex(w_offset, input_width, output_width); - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2089,7 +2089,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } - T1 input_grad_data = 0; + T1 input_grad_data = static_cast(0); int input_current_featuremap_idx = h_offset * input_width + w_offset; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { @@ -2259,6 +2259,14 @@ template class MaxPool2dWithIndexFunctor; template class MaxPool2dWithIndexGradFunctor; template class MaxPool2dWithIndexFunctor; template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx(const int ncd, @@ -2324,7 +2332,7 @@ __global__ void KernelMaxPool3DWithIdx(const int ncd, wstart = max(wstart, 0); } - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -2560,6 +2568,14 @@ template class MaxPool3dWithIndexFunctor; template class MaxPool3dWithIndexGradFunctor; template class MaxPool3dWithIndexFunctor; template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace funcs } // namespace phi 
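The float16/bfloat16 paths registered above reuse the same windowed max-plus-argmax loop as the float/double kernels; the loop changes are limited to routing the -FLT_MAX seed (and the zero initializer in the gradient kernel) through static_cast so they compile for the 16-bit element types. Below is a minimal host-side sketch of that inner loop, reduced to a single 1-D window; the helper name and the plain-float element type are illustrative, not taken from the kernels.

#include <cfloat>
#include <cstdio>
#include <vector>

// Windowed max + argmax over one pooling window, mirroring the kernels'
// inner loop: the running maximum starts at the lowest seed (-FLT_MAX) so
// the first in-window element always wins, and the flat input offset of the
// winning element is recorded as the mask value.
template <typename T>
void MaxWithIndex(const std::vector<T>& input, int start, int end,
                  T* out_val, int* out_idx) {
  T ele = static_cast<T>(-FLT_MAX);
  int max_index = -1;
  for (int i = start; i < end; ++i) {
    if (ele < input[i]) {
      ele = input[i];
      max_index = i;
    }
  }
  *out_val = ele;
  *out_idx = max_index;
}

int main() {
  std::vector<float> window = {0.1f, 2.5f, -3.0f, 0.7f};
  float max_val = 0.0f;
  int max_idx = -1;
  MaxWithIndex(window, 0, static_cast<int>(window.size()), &max_val, &max_idx);
  std::printf("max = %f, index = %d\n", max_val, max_idx);  // max = 2.500000, index = 1
  return 0;
}
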
diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu index e4cfcb23b730e..c625977543558 100644 --- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -38,7 +38,9 @@ PD_REGISTER_KERNEL(max_pool2d_with_index_grad, ALL_LAYOUT, phi::MaxPool2dWithIndexGradKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } @@ -55,6 +57,8 @@ PD_REGISTER_KERNEL(max_pool3d_with_index_grad, ALL_LAYOUT, phi::MaxPool3dWithIndexGradKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu index 65d0ef4bdc916..511cc263bc760 100644 --- a/paddle/phi/kernels/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -32,7 +32,9 @@ PD_REGISTER_KERNEL(max_pool2d_with_index, ALL_LAYOUT, phi::MaxPool2dWithIndexKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } @@ -49,6 +51,8 @@ PD_REGISTER_KERNEL(max_pool3d_with_index, ALL_LAYOUT, phi::MaxPool3dWithIndexKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index d8d61f4fb2904..16d1f356537bc 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -15,9 +15,16 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + get_numeric_gradient, +) import paddle +from paddle.fluid import core +from paddle.fluid.tests.unittests.testsuite import create_op def adaptive_start_index(index, input_size, output_size): @@ -149,9 +156,18 @@ def setUp(self): self.init_test_case() self.init_global() self.init_adaptive() + self.init_dtype() + + if self.is_bfloat16_op(): + input = np.random.random(self.shape).astype(np.float32) + input = convert_uint16_to_float( + convert_float_to_uint16(np.round(input * 100.0, 2)) + ) + + else: + input = np.random.random(self.shape).astype(self.dtype) + input = np.round(input * 100.0, 2) - input = np.random.random(self.shape).astype("float64") - input = np.round(input * 100.0, 2) output, mask = self.pool_forward_naive( input, self.ksize, @@ -160,8 +176,11 @@ def setUp(self): self.global_pool, self.adaptive, ) - output = output.astype("float64") mask = mask.astype("int32") + if self.is_bfloat16_op(): + output = output.astype(np.float32) + else: + output = output.astype(self.dtype) self.attrs = { 'strides': self.strides, @@ -171,8 +190,20 @@ def setUp(self): 'adaptive': self.adaptive, } - self.inputs = {'X': input} - self.outputs = {'Out': output, "Mask": mask} + if self.is_bfloat16_op(): + self.inputs = {'X': convert_float_to_uint16(input)} + self.outputs = { + 'Out': convert_float_to_uint16(output), + "Mask": mask, + } + self.inputs_fp32 = {'X': input} + + else: + self.inputs = {'X': input} + self.outputs = {'Out': output, "Mask": mask} + + def init_dtype(self): + self.dtype = np.float64 def test_check_output(self): self.check_output() @@ -220,9 +251,90 @@ def init_global(self): 
self.global_pool = False -# ----------------max_pool2d_with_index---------------- +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True +# ----------------max_pool3d_with_index_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestMaxPool3dFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, {'X'}, ['Out']) + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestMaxPool3dFP16.__name__ = cls_name + globals()[cls_name] = TestMaxPool3dFP16 + + +create_test_fp16_class(TestMaxPoolWithIndex_Op) +create_test_fp16_class(TestCase1) +create_test_fp16_class(TestCase2) +create_test_fp16_class(TestCase3) +create_test_fp16_class(TestCastAdaptive3d) + + +# ----------------max_pool3d_with_index_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestMaxPool3dBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Out'] + ) + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + ) + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestMaxPool3dBF16.__name__ = cls_name + globals()[cls_name] = TestMaxPool3dBF16 + + +create_test_bf16_class(TestMaxPoolWithIndex_Op) +create_test_bf16_class(TestCase1) +create_test_bf16_class(TestCase2) +create_test_bf16_class(TestCase3) +create_test_bf16_class(TestCastAdaptive3d) + + +# ----------------max_pool2d_with_index---------------- def max_pool2d_with_index_wapper( x, kernel_size=[], @@ -279,9 +391,82 @@ def init_adaptive(self): self.adaptive = True -class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): - def init_adaptive(self): - self.adaptive = True +# ----------------max_pool2d_with_index_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestMaxPool2dFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, {'X'}, ['Out']) + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestMaxPool2dFP16.__name__ = cls_name + globals()[cls_name] = TestMaxPool2dFP16 + + +create_test_fp16_class(TestCase4) 
+create_test_fp16_class(TestCase5) +create_test_fp16_class(TestCase6) +create_test_fp16_class(TestCase7) +create_test_fp16_class(TestCastAdaptive2d) + + +# ----------------max_pool2d_with_index_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestMaxPool2dBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Out'] + ) + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + ) + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestMaxPool2dBF16.__name__ = cls_name + globals()[cls_name] = TestMaxPool2dBF16 + + +create_test_bf16_class(TestCase4) +create_test_bf16_class(TestCase5) +create_test_bf16_class(TestCase6) +create_test_bf16_class(TestCase7) +create_test_bf16_class(TestCastAdaptive2d) if __name__ == '__main__': From c7c5635e0752e3cdc4aa4cc50c10294cf154000e Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Tue, 25 Apr 2023 13:24:10 +0800 Subject: [PATCH 045/405] [Paddle-TRT] The Graph uses OpConverterType for op converter (#53214) * add ```converter_type``` for op converter --- paddle/fluid/inference/tensorrt/convert/op_converter.h | 5 +++-- .../tensorrt/convert/test_custom_plugin_creater.cc | 4 ++-- .../inference/tensorrt/convert/test_op_converter.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 6 +++--- paddle/fluid/inference/tensorrt/op_teller.h | 9 ++------- .../fluid/operators/tensorrt/tensorrt_engine_op_test.cc | 7 +++++-- 6 files changed, 16 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index ee8cc0c8681c0..87ad887cef383 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -56,8 +56,9 @@ class OpConverter { OpConverter* it{nullptr}; - auto op_converter_type_map = OpTeller::Global().GetOpConverterTypeMap(); - switch (op_converter_type_map.at(op_desc.Type())) { + auto converter_type = static_cast( + PADDLE_GET_CONST(int, op_desc.GetAttr("converter_type"))); + switch (converter_type) { case OpConverterType::Default: if (op_desc.Type().find("elementwise") != std::string::npos) { static std::unordered_set add_tensor_op_set{ diff --git a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc index 47c3793ab9f0b..eee4d0c12edbe 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc @@ -109,7 +109,7 @@ TEST(CustomPluginCreater, StaticShapePlugin) { framework::OpDesc custom_op(*op_desc, nullptr); CHECK_EQ((*custom_plugin_tell)(custom_op, false, false), true); - 
OpTeller::Global().SetOpConverterType("custom_op", + OpTeller::Global().SetOpConverterType(&custom_op, OpConverterType::CustomPluginCreater); OpConverter converter; @@ -196,7 +196,7 @@ TEST(CustomPluginCreater, DynamicShapePlugin) { framework::OpDesc custom_op(*op_desc, nullptr); CHECK_EQ((*custom_plugin_tell)(custom_op, false, true), true); - OpTeller::Global().SetOpConverterType("custom_op", + OpTeller::Global().SetOpConverterType(&custom_op, OpConverterType::CustomPluginCreater); OpConverter converter; diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index ee45b602b3ab9..3a15af255e5bc 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -57,7 +57,7 @@ TEST(OpConverter, ConvertBlock) { x_tensor->Resize(phi::make_ddim(dim_vec)); x_tensor->mutable_data(platform::CUDAPlace(0)); - OpTeller::Global().SetOpConverterType("conv2d", OpConverterType::Default); + OpTeller::Global().SetOpConverterType(conv2d_op, OpConverterType::Default); OpConverter converter; converter.ConvertBlock( *block->Proto(), {"conv2d-Y"}, scope, engine_.get() /*TensorRTEngine*/); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b17aca9e8cb4d..78e300a8d730d 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -3080,17 +3080,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, return false; auto& default_teller = GetDefaultTeller(); if ((*default_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { - SetOpConverterType(op_type, OpConverterType::Default); + SetOpConverterType(node->Op(), OpConverterType::Default); return true; } auto& generic_plugin_teller = GetGenericPluginTeller(); if ((*generic_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { - SetOpConverterType(op_type, OpConverterType::GenericPluginCreater); + SetOpConverterType(node->Op(), OpConverterType::GenericPluginCreater); return true; } auto& custom_plugin_teller = GetCustomPluginTeller(); if ((*custom_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { - SetOpConverterType(op_type, OpConverterType::CustomPluginCreater); + SetOpConverterType(node->Op(), OpConverterType::CustomPluginCreater); return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 2fa3dc361217e..cb879cb5f9f61 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -82,12 +82,8 @@ class OpTeller { std::unique_ptr& GetCustomPluginTeller() { return tellers_.at(2); } - void SetOpConverterType(std::string name, OpConverterType type) { - op_converter_type_map_[name] = type; - } - - const std::map& GetOpConverterTypeMap() const { - return op_converter_type_map_; + void SetOpConverterType(framework::OpDesc* op_desc, OpConverterType type) { + op_desc->SetAttr("converter_type", static_cast(type)); } private: @@ -95,7 +91,6 @@ class OpTeller { private: std::vector> tellers_; - std::map op_converter_type_map_; }; } // namespace tensorrt diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 5a9fa7241e853..3d96361d89f04 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -94,6 
+94,11 @@ void DynamicShapeTest(bool allow_build_at_runtime) { "Out", std::vector({"z0"})); // 2 x 4 x 4 x 4 elementwise_add1->SetAttr("axis", static_cast(0)); + inference::tensorrt::OpTeller::Global().SetOpConverterType( + elementwise_add0, inference::tensorrt::OpConverterType::Default); + inference::tensorrt::OpTeller::Global().SetOpConverterType( + elementwise_add1, inference::tensorrt::OpConverterType::Default); + // Set inputs' variable shape in BlockDesc // the batch size is 2, so the dims of 'x' is {2, 4} AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 4, 4})); @@ -170,8 +175,6 @@ void DynamicShapeTest(bool allow_build_at_runtime) { // Execute them. LOG(INFO) << "engine_op run"; - inference::tensorrt::OpTeller::Global().SetOpConverterType( - "elementwise_add", inference::tensorrt::OpConverterType::Default); engine_op->Run(scope, place); } From 336bc20b18cc3f30dea0819ee99ddc47109a4282 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Tue, 25 Apr 2023 13:44:29 +0800 Subject: [PATCH 046/405] tile op support 0D input for xpu (#53237) --- paddle/phi/kernels/xpu/tile_kernel.cc | 41 +++++++++++------------- test/xpu/test_tile_op_xpu.py | 46 +++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc index 419ff72e640ff..f6bc716a7d58a 100644 --- a/paddle/phi/kernels/xpu/tile_kernel.cc +++ b/paddle/phi/kernels/xpu/tile_kernel.cc @@ -31,13 +31,15 @@ void TileKernel(const Context& dev_ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; auto rank = x.dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); + std::vector repeat_times = repeat_times_arr.GetData(); + int repeat_times_size = repeat_times.size(); + rank = std::max(rank, repeat_times_size); + PADDLE_ENFORCE_GE(rank, + 0, + errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a >=0 " + "integer, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( rank, MAX_RANK_SUPPORTED, @@ -46,14 +48,12 @@ void TileKernel(const Context& dev_ctx, "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, rank)); - std::vector repeat_times = repeat_times_arr.GetData(); - int repeat_times_size = repeat_times.size(); PADDLE_ENFORCE_GE( repeat_times_size, - 1, + 0, errors::InvalidArgument( "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", + "op must be >=0, but the value received is %d.", repeat_times_size)); PADDLE_ENFORCE_LE( repeat_times_size, @@ -102,20 +102,15 @@ void TileKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); std::vector temp(repeat_times.size(), 1); - if (repeat_times == temp) { + if (rank == 0 || repeat_times == temp) { out->Resize(x.dims()); dev_ctx.template Alloc(out); - if (std::is_same::value) { - int r = xpu::copy(dev_ctx.x_context(), - reinterpret_cast(x.data()), - reinterpret_cast(out->data()), - 8 * x.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); - } else { - int r = xpu::copy( - dev_ctx.x_context(), x.data(), out->data(), x.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); - } + int64_t count = x.numel() * sizeof(T); + int r = xpu::copy(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + count); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); 
return; } diff --git a/test/xpu/test_tile_op_xpu.py b/test/xpu/test_tile_op_xpu.py index dc2b0d7f0edcd..2e661199a0928 100644 --- a/test/xpu/test_tile_op_xpu.py +++ b/test/xpu/test_tile_op_xpu.py @@ -90,6 +90,21 @@ def init_data(self): self.ori_shape = (2, 4, 5, 7) self.repeat_times = (3, 2, 1, 2) + class TestTileOpRank_ZeroDim1(TestTileOpRank1): + def init_data(self): + self.ori_shape = [] + self.repeat_times = [] + + class TestTileOpRank_ZeroDim2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [] + self.repeat_times = [2] + + class TestTileOpRank_ZeroDim3(TestTileOpRank1): + def init_data(self): + self.ori_shape = [] + self.repeat_times = [2, 3] + # Situation 2: repeat_times is a list (with tensor) class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper): @@ -209,5 +224,36 @@ def test_api(self): assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3))) +class TestTileAPI_ZeroDim(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.tile(x, []) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + out = paddle.tile(x, [3]) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, [3]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, [3]) + + out = paddle.tile(x, [2, 3]) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, [2, 3]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, [2, 3]) + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() From bddeecd1a8e57d69110fbc0f5896a60bb25baa46 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Tue, 25 Apr 2023 14:07:45 +0800 Subject: [PATCH 047/405] test,test=develop (#53301) --- .../distributed/collective/process_group.h | 267 +++++++++--------- 1 file changed, 137 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/distributed/collective/process_group.h b/paddle/fluid/distributed/collective/process_group.h index df9cfdfbc794f..447fc5d1b3c7b 100644 --- a/paddle/fluid/distributed/collective/process_group.h +++ b/paddle/fluid/distributed/collective/process_group.h @@ -59,20 +59,20 @@ class ProcessGroup { virtual ~Task() = default; virtual bool IsCompleted(); - virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) { + virtual bool Wait(std::chrono::milliseconds timeout UNUSED = kWaitTimeout) { return false; } virtual void Synchronize() {} - virtual void UpdateWaitChain(const phi::DeviceContext& ctx) {} + virtual void UpdateWaitChain(const phi::DeviceContext& ctx UNUSED) {} bool IsSync() const { return sync_op_; } // TODO(sunyilun): methods below will be removed later Task(int rank, - const std::vector& inputs, + const std::vector& inputs UNUSED, CommType comm_type) : rank_(rank), comm_type_(comm_type) {} Task(int rank, - const std::vector& inputs, + const std::vector& inputs UNUSED, CommType comm_type, bool sync_op) : rank_(rank), comm_type_(comm_type), sync_op_(sync_op) {} @@ -97,14 +97,15 @@ class ProcessGroup { virtual std::string GetBackendName() const = 0; - virtual phi::DeviceContext* GetDeviceContext(const Place& place) const { + virtual phi::DeviceContext* GetDeviceContext( + const Place& place UNUSED) const { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support get device_context.", GetBackendName())); } - virtual phi::DeviceContext* GetDeviceContext(const Place& place, - bool 
use_calc_stream) const { + virtual phi::DeviceContext* GetDeviceContext( + const Place& place UNUSED, bool use_calc_stream UNUSED) const { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support get device_context.", GetBackendName())); @@ -112,123 +113,127 @@ class ProcessGroup { // without stream APIs virtual std::shared_ptr AllGather( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support all_gather with sync_op flag.", GetBackendName())); } virtual std::shared_ptr AllGather( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - int64_t offset, - int64_t numel, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + int64_t offset UNUSED, + int64_t numel UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support all_gather with sync_op flag.", GetBackendName())); } virtual std::shared_ptr AllReduce( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const AllreduceOptions& opts, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const AllreduceOptions& opts UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support all_reduce with sync_op flag.", GetBackendName())); } virtual std::shared_ptr AllToAll( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const std::vector& out_size_each_rank, - const std::vector& in_size_each_rank, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const std::vector& out_size_each_rank UNUSED, + const std::vector& in_size_each_rank UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support all_to_all with sync_op flag.", GetBackendName())); } virtual std::shared_ptr Barrier( - const BarrierOptions& = BarrierOptions()) { + const BarrierOptions& UNUSED = BarrierOptions()) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support barrier.", GetBackendName())); } virtual std::shared_ptr Broadcast( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const BroadcastOptions& opts, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const BroadcastOptions& opts UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support broadcast with sync_op flag", GetBackendName())); } virtual std::shared_ptr Reduce( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const ReduceOptions& opts, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const ReduceOptions& opts UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support reduce with sync_op flag.", GetBackendName())); } virtual std::shared_ptr ReduceScatter( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const ReduceScatterOptions& opts, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const ReduceScatterOptions& opts UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support 
reduce_scatter with sync_op flag.", GetBackendName())); } virtual std::shared_ptr Scatter( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const ScatterOptions& opts, - bool sync_op) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const ScatterOptions& opts UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support scatter with sync_op flag.", GetBackendName())); } - virtual std::shared_ptr Recv(phi::DenseTensor* tensor, - int src_rank, - bool sync_op) { + virtual std::shared_ptr Recv(phi::DenseTensor* tensor + UNUSED, + int src_rank UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support recv with sync_op flag.", GetBackendName())); } - virtual std::shared_ptr Recv(phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t numel, - bool sync_op) { + virtual std::shared_ptr Recv(phi::DenseTensor* tensor + UNUSED, + int src_rank UNUSED, + int64_t offset UNUSED, + int64_t numel UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support recv with sync_op flag.", GetBackendName())); } virtual std::shared_ptr Send( - const phi::DenseTensor& tensor, int dst_rank, bool sync_op) { + const phi::DenseTensor& tensor UNUSED, + int dst_rank UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support send with sync_op flag.", GetBackendName())); } virtual std::shared_ptr Send( - const phi::DenseTensor& tensor, - int dst_rank, - int64_t offset, - int64_t numel, - bool sync_op) { + const phi::DenseTensor& tensor UNUSED, + int dst_rank UNUSED, + int64_t offset UNUSED, + int64_t numel UNUSED, + bool sync_op UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support send with sync_op flag.", GetBackendName())); @@ -236,10 +241,10 @@ class ProcessGroup { // stream APIs virtual std::shared_ptr AllGather( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support all_gather " "with sync_op and use_calc_stream flag.", @@ -247,12 +252,12 @@ class ProcessGroup { } virtual std::shared_ptr AllGather( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - int64_t offset, - int64_t numel, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + int64_t offset UNUSED, + int64_t numel UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support all_gather " "with sync_op and use_calc_stream flag.", @@ -260,11 +265,11 @@ class ProcessGroup { } virtual std::shared_ptr AllReduce( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const AllreduceOptions& opts, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const AllreduceOptions& opts UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support all_reduce " "with sync_op and use_calc_stream flag.", @@ -272,12 +277,12 @@ class ProcessGroup { } virtual std::shared_ptr AllToAll( - 
phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const std::vector& out_size_each_rank, - const std::vector& in_size_each_rank, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const std::vector& out_size_each_rank UNUSED, + const std::vector& in_size_each_rank UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support all_to_all " "with sync_op and use_calc_stream flag.", @@ -285,11 +290,11 @@ class ProcessGroup { } virtual std::shared_ptr Broadcast( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const BroadcastOptions& opts, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const BroadcastOptions& opts UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support broadcast " "with sync_op and use_calc_stream flag.", @@ -297,11 +302,11 @@ class ProcessGroup { } virtual std::shared_ptr Reduce( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const ReduceOptions& opts, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const ReduceOptions& opts UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support reduce " "with sync_op and use_calc_stream flag.", @@ -309,11 +314,11 @@ class ProcessGroup { } virtual std::shared_ptr ReduceScatter( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const ReduceScatterOptions& opts, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const ReduceScatterOptions& opts UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW(phi::errors::Unimplemented( "ProcessGroup%s does not support reduce_scatter " "with sync_op and use_calc_stream flag.", @@ -321,11 +326,11 @@ class ProcessGroup { } virtual std::shared_ptr Scatter( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const ScatterOptions& opts, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const ScatterOptions& opts UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support scatter " "with sync_op and use_calc_stream flag.", @@ -333,11 +338,11 @@ class ProcessGroup { } virtual std::shared_ptr Gather( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - const GatherOptions& opts, - bool sync_op, - bool use_calc_stream) { + phi::DenseTensor* out_tensor UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const GatherOptions& opts UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support gather " "with sync_op and use_calc_stream flag.", @@ -345,33 +350,35 @@ class ProcessGroup { } virtual std::shared_ptr Gather( - std::vector* gather_tensors_ptr, - const phi::DenseTensor& in_tensor, - const GatherOptions& opts, - bool sync_op, - bool use_calc_stream) { + std::vector* gather_tensors_ptr UNUSED, + const phi::DenseTensor& in_tensor UNUSED, + const GatherOptions& opts UNUSED, + bool sync_op UNUSED, + bool 
use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support gather " "with sync_op and use_calc_stream flag.", GetBackendName())); } - virtual std::shared_ptr Recv(phi::DenseTensor* tensor, - int src_rank, - bool sync_op, - bool use_calc_stream) { + virtual std::shared_ptr Recv( + phi::DenseTensor* tensor UNUSED, + int src_rank UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support recv with " "sync_op and use_calc_stream flag.", GetBackendName())); } - virtual std::shared_ptr Recv(phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t numel, - bool sync_op, - bool use_calc_stream) { + virtual std::shared_ptr Recv( + phi::DenseTensor* tensor UNUSED, + int src_rank UNUSED, + int64_t offset UNUSED, + int64_t numel UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support recv " "with sync_op and use_calc_stream flag.", @@ -379,10 +386,10 @@ class ProcessGroup { } virtual std::shared_ptr Send( - const phi::DenseTensor& tensor, - int dst_rank, - bool sync_op, - bool use_calc_stream) { + const phi::DenseTensor& tensor UNUSED, + int dst_rank UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support send " "with sync_op and use_calc_stream flag.", @@ -390,12 +397,12 @@ class ProcessGroup { } virtual std::shared_ptr Send( - const phi::DenseTensor& tensor, - int dst_rank, - int64_t offset, - int64_t numel, - bool sync_op, - bool use_calc_stream) { + const phi::DenseTensor& tensor UNUSED, + int dst_rank UNUSED, + int64_t offset UNUSED, + int64_t numel UNUSED, + bool sync_op UNUSED, + bool use_calc_stream UNUSED) { PADDLE_THROW( phi::errors::Unimplemented("ProcessGroup%s does not support send " "with sync_op and use_calc_stream flag.", @@ -407,7 +414,7 @@ class ProcessGroup { virtual std::shared_ptr AllReduce( std::vector& /* input tensors */, // NOLINT std::vector& /* output tensors */, // NOLINT - const AllreduceOptions& = AllreduceOptions()) { + const AllreduceOptions& UNUSED = AllreduceOptions()) { PADDLE_THROW(phi::errors::InvalidArgument( "ProcessGroup%s does not support allreduce", GetBackendName())); } @@ -415,7 +422,7 @@ class ProcessGroup { virtual std::shared_ptr AllReduce( std::vector& /* input tensors */, // NOLINT std::vector& /* output tensors */, // NOLINT - const AllreduceOptions&, + const AllreduceOptions& UNUSED, bool) { PADDLE_THROW(phi::errors::InvalidArgument( "ProcessGroup%s does not support allreduce with sync_op flag", @@ -426,7 +433,7 @@ class ProcessGroup { virtual std::shared_ptr Broadcast( std::vector& /* input tensors */, // NOLINT std::vector& /* output tensors */, // NOLINT - const BroadcastOptions& = BroadcastOptions()) { + const BroadcastOptions& UNUSED = BroadcastOptions()) { PADDLE_THROW(phi::errors::InvalidArgument( "ProcessGroup%s does not support broadcast", GetBackendName())); } @@ -434,7 +441,7 @@ class ProcessGroup { virtual std::shared_ptr Broadcast( std::vector& /* input tensors */, // NOLINT std::vector& /* output tensors */, // NOLINT - const BroadcastOptions&, + const BroadcastOptions& UNUSED, bool) { PADDLE_THROW(phi::errors::InvalidArgument( "ProcessGroup%s does not support broadcast with sync_op flag", From ed45ecc626c07d50c0a1128b92797a4247d69aa9 Mon Sep 17 00:00:00 2001 From: qizhaoaoe <10208099+qizhaoaoe@users.noreply.github.com> Date: Tue, 25 Apr 2023 
14:25:03 +0800 Subject: [PATCH 048/405] [fluid clean] remove Print. (#51778) * fluid clean: remove print/switch from fluid to static * remove Switch in static.__init__ * fix conflicts. * replace Switch by case. * fix piecewise_lr decay. * fix typo * fix conflicts. * fix lr dtype * keep Switch in paddle.static.nn.control_flow and fix piecewise_lr. * fix conflicts. * keep Switch in the fluid. * fix Switch doc * fix example in Switch doc * fix Switch doc. * fix static/__init__. --- python/paddle/fluid/layers/control_flow.py | 100 +----------------- .../fluid/layers/learning_rate_scheduler.py | 37 ++++--- python/paddle/fluid/optimizer.py | 69 ++++++------ .../dist_fleet_heter_pipeline_ctr.py | 2 +- .../fluid/tests/unittests/test_switch.py | 34 +++--- .../paddle/jit/dy2static/convert_operators.py | 4 +- python/paddle/nn/layer/rnn.py | 2 - python/paddle/reader/decorator.py | 3 +- python/paddle/static/__init__.py | 2 +- python/paddle/static/nn/control_flow.py | 100 +++++++++++++++++- .../test_multi_precision_fp16_train.py | 10 +- test/ipu/test_print_op_ipu.py | 2 +- 12 files changed, 185 insertions(+), 180 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index a0ad94df79d67..6d402df9f3cd4 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -50,7 +50,6 @@ __all__ = [ 'Switch', 'StaticRNN', - 'Print', 'while_loop', ] @@ -141,104 +140,6 @@ def select_input(inputs, mask): return out -@static_only -def Print( - input, - first_n=-1, - message=None, - summarize=20, - print_tensor_name=True, - print_tensor_type=True, - print_tensor_shape=True, - print_tensor_layout=True, - print_tensor_lod=True, - print_phase='both', -): - ''' - :api_attr: Static Graph - - **Print operator** - - This creates a print op that will print when a tensor is accessed. - - Wraps the tensor passed in so that whenever that a tensor is accessed, - the message `message` is printed, along with the current value of the - tensor `t`. - - Args: - input (Tensor): A Tensor to print. - first_n (int, optional): Only log `first_n` number of times. Default: -1. - message (str, optional): A string message to print as a prefix. Default: None. - summarize (int, optional): Number of elements in the tensor to be print. If - it's value is -1, then all elements in the tensor will be print. - print_tensor_name (bool, optional): Print the tensor name. Default: True. - print_tensor_type (bool, optional): Print the tensor type. Defaultt: True. - print_tensor_shape (bool, optional): Print the tensor shape. Default: True. - print_tensor_layout (bool, optional): Print the tensor layout. Default: True. - print_tensor_lod (bool, optional): Print the tensor lod. Default: True. - print_phase (str, optional): Which phase to displace, including 'forward', - 'backward' and 'both'. Default: 'both'. If set to 'backward', will - only print the gradients of input tensor; If set to 'both', will - both print the input tensor itself and the gradients of input tensor. - - Returns: - Tensor: Output tensor. - - NOTES: - The input and output are two different Tensor, and in the - following process, you should use the output Tensor but not the input, - otherwise, the print layer doesn't have backward. - - Examples: - .. 
code-block:: python - - import paddle - - paddle.enable_static() - - x = paddle.full(shape=[2, 3], fill_value=3, dtype='int64') - out = paddle.static.Print(x, message="The content of input layer:") - - main_program = paddle.static.default_main_program() - exe = paddle.static.Executor(place=paddle.CPUPlace()) - res = exe.run(main_program, fetch_list=[out]) - # Variable: fill_constant_1.tmp_0 - # - message: The content of input layer: - # - lod: {} - # - place: CPUPlace - # - shape: [2, 3] - # - layout: NCHW - # - dtype: long - # - data: [3 3 3 3 3 3] - ''' - check_variable_and_dtype( - input, - 'input', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'fluid.layers.Print', - ) - - helper = LayerHelper('print' + "_" + input.name, **locals()) - output = helper.create_variable_for_type_inference(input.dtype) - helper.append_op( - type='print', - inputs={'In': input}, - outputs={'Out': output}, - attrs={ - 'first_n': first_n, - 'summarize': summarize, - 'message': message or "", - 'print_tensor_name': print_tensor_name, - 'print_tensor_type': print_tensor_type, - 'print_tensor_shape': print_tensor_shape, - 'print_tensor_layout': print_tensor_layout, - 'print_tensor_lod': print_tensor_lod, - 'print_phase': print_phase.upper(), - }, - ) - return output - - # (TODO: Mine) There exists dependency. It will be removed later. class BlockGuard: """ @@ -1512,6 +1413,7 @@ def map_fn(n1, n2, name, order): return nest1_out, nest2_out +# TODO: It will be deleted later. class Switch: """ :api_attr: Static Graph diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 6d86fa9448c17..65fe1d1e77b35 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -359,9 +359,11 @@ def polynomial_decay( shape=[1], dtype='float32', value=1.0 ) - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - paddle.assign(one_var, output=div_res) + div_val = paddle.static.nn.cond( + global_step == zero_var, lambda: one_var, lambda: div_res + ) + paddle.assign(div_val, output=div_res) + decay_steps = decay_steps * div_res else: decay_steps_var = paddle.tensor.fill_constant( @@ -432,7 +434,7 @@ def piecewise_decay(boundaries, values): persistable=True, name="learning_rate", ) - + # TODO: fluid.layers.control_flow.Switch should be replaced by paddle.static.nn.case(or cond) if possible with control_flow.Switch() as switch: for i in range(len(boundaries)): boundary_val = paddle.tensor.fill_constant( @@ -455,7 +457,6 @@ def piecewise_decay(boundaries, values): value=float(values[len(values) - 1]), out=lr, ) - return lr @@ -589,17 +590,19 @@ def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): ) global_step = _decay_step_counter() - - with control_flow.Switch() as switch: - with switch.case(global_step < warmup_steps): - decayed_lr = start_lr + linear_step * ( - global_step / float(warmup_steps) + if not isinstance(learning_rate, Variable): + learning_rate = paddle.tensor.fill_constant( + shape=[1], dtype=dtype, value=float(learning_rate) + ) + lr_val = paddle.static.nn.case( + pred_fn_pairs=[ + ( + global_step < warmup_steps, + lambda: start_lr + + linear_step * (global_step / float(warmup_steps)), ) - paddle.assign(decayed_lr, lr) - with switch.default(): - if not isinstance(learning_rate, Variable): - learning_rate = paddle.tensor.fill_constant( - shape=[1], dtype=dtype, value=float(learning_rate) - ) - paddle.assign(learning_rate, lr) + ], + 
default=lambda: learning_rate, + ) + paddle.assign(lr_val, lr) return lr diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1dca88d61e327..f6bd5dbd37c33 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4373,12 +4373,12 @@ def __init__(self, decay=0.999, thres_steps=None, name=None): ema = block._clone_variable(self._ema_vars[param.name]) paddle.assign(param, output=tmp) # bias correction - with layers.control_flow.Switch() as switch: - with switch.case(global_step > 0): - paddle.assign(ema / (1.0 - decay_pow), output=param) - with switch.default(): - paddle.assign(ema, output=param) - + param_val = paddle.static.nn.cond( + global_step > 0, + lambda: ema / (1.0 - decay_pow), + lambda: ema, + ) + paddle.assign(param_val, output=param) self.restore_program = Program() block = self.restore_program.global_block() with program_guard(main_program=self.restore_program): @@ -4399,13 +4399,12 @@ def _get_ema_decay(self): if self._thres_steps is not None: decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0) - with layers.control_flow.Switch() as switch: - with switch.case(decay_t < self._decay): - paddle.assign(decay_t, decay_var) - with switch.default(): - paddle.assign( - np.array([self._decay], dtype=np.float32), decay_var - ) + decay_val = paddle.static.nn.cond( + decay_t < self._decay, + lambda: decay_t, + lambda: np.array([self._decay], dtype=np.float32), + ) + paddle.assign(decay_val, decay_var) return decay_var def _get_decay_pow(self, block): @@ -7408,26 +7407,30 @@ def minimize(self, loss, startup_program=None): ) mod = paddle.remainder(step, k) - with layers.control_flow.Switch() as switch: - with switch.case(step == one_var): - for param_name in params: - fast_var = main_block.var(param_name) - slow_var = param_to_slow[param_name] - paddle.assign(fast_var, output=slow_var) - with switch.case(mod == zero_var): - for param_name in params: - fast_var = main_block.var(param_name) - slow_var = param_to_slow[param_name] - tmp_var = paddle.add( - paddle.multiply(fast_var, alpha), - paddle.multiply( - slow_var, paddle.subtract(one_var, alpha) - ), - ) - paddle.assign(tmp_var, output=slow_var) - paddle.assign(tmp_var, output=fast_var) - with switch.default(): - pass + for param_name in params: + fast_var = main_block.var(param_name) + slow_var = param_to_slow[param_name] + tmp_var = paddle.add( + paddle.multiply(fast_var, alpha), + paddle.multiply(slow_var, paddle.subtract(one_var, alpha)), + ) + slow_val = paddle.static.nn.case( + [ + (step == one_var, lambda: fast_var), + (mod == zero_var, lambda: tmp_var), + ], + default=lambda: slow_var, + ) + paddle.assign(slow_val, slow_var) + + fast_val = paddle.static.nn.case( + [ + (mod == zero_var, lambda: tmp_var), + ], + default=lambda: fast_var, + ) + paddle.assign(fast_val, fast_var) + return mini_out diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py index 5dd23f1352559..a5010e275aa8a 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py @@ -126,7 +126,7 @@ def net(self, args, batch_size=4, lr=0.01): input=predict, label=label, reduction='none', use_softmax=False ) avg_cost = paddle.mean(x=cost) - fluid.layers.Print(avg_cost, message="avg_cost") + paddle.static.Print(avg_cost, message="avg_cost") self.feeds = datas self.train_file_path = ["fake1", "fake2"] diff 
--git a/python/paddle/fluid/tests/unittests/test_switch.py b/python/paddle/fluid/tests/unittests/test_switch.py index d5d118867b1a5..428e5537f8ba6 100644 --- a/python/paddle/fluid/tests/unittests/test_switch.py +++ b/python/paddle/fluid/tests/unittests/test_switch.py @@ -15,7 +15,7 @@ import unittest import paddle -from paddle.fluid import core, framework, layers +from paddle.fluid import core, framework from paddle.fluid.executor import Executor from paddle.fluid.framework import default_startup_program @@ -40,15 +40,15 @@ def check_switch(self, value): shape=[1], value=-1.0, dtype='float32', persistable=True ) - with layers.Switch() as switch: - with switch.case(paddle.less_than(x, zero_var)): - paddle.assign(zero_var, result) - with switch.case(paddle.less_than(x, one_var)): - paddle.assign(one_var, result) - with switch.case(paddle.less_than(x, two_var)): - paddle.assign(two_var, result) - with switch.default(): - paddle.assign(three_var, result) + res = paddle.static.nn.case( + pred_fn_pairs=[ + (paddle.less_than(x, zero_var), lambda: zero_var), + (paddle.less_than(x, one_var), lambda: one_var), + (paddle.less_than(x, two_var), lambda: two_var), + ], + default=lambda: three_var, + ) + paddle.assign(res, result) cpu = core.CPUPlace() exe = Executor(cpu) @@ -85,17 +85,19 @@ def test_error(self): # 1. The type of 'condition' in case must be Variable. def test_condition_type(): - with layers.Switch() as switch: - with switch.case(1): - paddle.assign(zero_var, result) + res = paddle.static.nn.case( + [(1, lambda: zero_var)], default=lambda: result + ) + paddle.assign(res, result) self.assertRaises(TypeError, test_condition_type) # 2. The dtype of 'condition' in case must be 'bool'. def test_condition_dtype(): - with layers.Switch() as switch: - with switch.case(cond): - paddle.assign(zero_var, result) + res = paddle.static.nn.case( + [cond, lambda: zero_var], default=lambda: result + ) + paddle.assign(res, result) self.assertRaises(TypeError, test_condition_dtype) diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index ad9abcc9849ef..52d6b7cb854d9 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -21,7 +21,7 @@ in_declarative_mode, ) from paddle.fluid.framework import Variable, core -from paddle.fluid.layers import Print, control_flow +from paddle.fluid.layers import control_flow from paddle.fluid.layers.control_flow import while_loop from .utils import ( @@ -749,7 +749,7 @@ def convert_print(*objects, sep=' ', end='\n', file=None, flush=False): """ for obj in objects: if isinstance(obj, Variable): - Print(obj) + paddle.static.Print(obj) print(*objects, sep=sep, end=end, file=file, flush=flush) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index cc8ab648b8895..2a0c9157a7a4a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -299,13 +299,11 @@ def _switch_grad(x, stop=False): pre_state = paddle.utils.map_structure( lambda x: paddle.tensor.array_read(x, start_i), init_array ) - # pre_state = paddle.fluid.layers.Print( pre_state, message="pre") outputs, new_states = cell(step_in, pre_state, **kwargs) assert isinstance(outputs, paddle.fluid.framework.Variable) paddle.utils.assert_same_structure(new_states, pre_state) if sequence_length: step_mask = paddle.unsqueeze(mask[start_i], 1) - # paddle.fluid.layers.Print( step_mask, message="mask") # new_states = map_structure( # partial(_maybe_copy, 
step_mask=step_mask), # pre_state, new_states diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index e5c47ebdb3432..bd40c4553e89d 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -553,8 +553,9 @@ def _impl(): with fluid.program_guard(fluid.Program(), fluid.Program()): place = fluid.CPUPlace() # the 1st 2 is batch size + image = paddle.static.data(name='image', dtype='int64', shape=[2, 1, 2]) - fluid.layers.Print(image) + paddle.static.Print(image) # print detailed tensor info of image variable reader = fluid.io.PyReader(feed_list=[image], capacity=2) diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index f63971b966a74..084579a58e591 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -65,7 +65,7 @@ from ..fluid.framework import Parameter # noqa: F401 from ..fluid.framework import ipu_shard_guard # noqa: F401 from ..fluid.framework import set_ipu_shard # noqa: F401 -from ..fluid.layers.control_flow import Print # noqa: F401 +from .nn.control_flow import Print # noqa: F401 from ..fluid.param_attr import WeightNormParamAttr # noqa: F401 from ..fluid.optimizer import Optimizer # noqa: F401 from ..fluid.optimizer import Adam # noqa: F401 diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py index c5d5265477541..bc5f1d2d5d6f5 100644 --- a/python/paddle/static/nn/control_flow.py +++ b/python/paddle/static/nn/control_flow.py @@ -24,7 +24,7 @@ convert_dtype, ) from paddle.fluid import core -from paddle.fluid.framework import Operator, Program, Variable +from paddle.fluid.framework import Operator, Program, Variable, static_only # Temporary solution, it will be deleted later from paddle.fluid.layers.control_flow import ConditionalBlock, select_input @@ -1329,3 +1329,101 @@ def map_fn(x): nest1_out = pack_sequence_as(nest1, list(map(map_fn, flatten(nest1)))) nest2_out = pack_sequence_as(nest2, list(map(map_fn, flatten(nest2)))) return nest1_out, nest2_out + + +@static_only +def Print( + input, + first_n=-1, + message=None, + summarize=20, + print_tensor_name=True, + print_tensor_type=True, + print_tensor_shape=True, + print_tensor_layout=True, + print_tensor_lod=True, + print_phase='both', +): + ''' + :api_attr: Static Graph + + **Print operator** + + This creates a print op that will print when a tensor is accessed. + + Wraps the tensor passed in so that whenever that a tensor is accessed, + the message `message` is printed, along with the current value of the + tensor `t`. + + Args: + input (Tensor): A Tensor to print. + first_n (int, optional): Only log `first_n` number of times. Default: -1. + message (str, optional): A string message to print as a prefix. Default: None. + summarize (int, optional): Number of elements in the tensor to be print. If + it's value is -1, then all elements in the tensor will be print. + print_tensor_name (bool, optional): Print the tensor name. Default: True. + print_tensor_type (bool, optional): Print the tensor type. Defaultt: True. + print_tensor_shape (bool, optional): Print the tensor shape. Default: True. + print_tensor_layout (bool, optional): Print the tensor layout. Default: True. + print_tensor_lod (bool, optional): Print the tensor lod. Default: True. + print_phase (str, optional): Which phase to displace, including 'forward', + 'backward' and 'both'. Default: 'both'. 
If set to 'backward', will + only print the gradients of input tensor; If set to 'both', will + both print the input tensor itself and the gradients of input tensor. + + Returns: + Tensor: Output tensor. + + NOTES: + The input and output are two different Tensor, and in the + following process, you should use the output Tensor but not the input, + otherwise, the print layer doesn't have backward. + + Examples: + .. code-block:: python + + import paddle + + paddle.enable_static() + + x = paddle.full(shape=[2, 3], fill_value=3, dtype='int64') + out = paddle.static.Print(x, message="The content of input layer:") + + main_program = paddle.static.default_main_program() + exe = paddle.static.Executor(place=paddle.CPUPlace()) + res = exe.run(main_program, fetch_list=[out]) + # Variable: fill_constant_1.tmp_0 + # - message: The content of input layer: + # - lod: {} + # - place: CPUPlace + # - shape: [2, 3] + # - layout: NCHW + # - dtype: long + # - data: [3 3 3 3 3 3] + ''' + check_variable_and_dtype( + input, + 'input', + ['float32', 'float64', 'int32', 'int64', 'bool'], + 'paddle.static.Print', + ) + + helper = LayerHelper('print' + "_" + input.name, **locals()) + output = helper.create_variable_for_type_inference(input.dtype) + helper.append_op( + type='print', + inputs={'In': input}, + outputs={'Out': output}, + attrs={ + 'first_n': first_n, + 'summarize': summarize, + 'message': message or "", + 'print_tensor_name': print_tensor_name, + 'print_tensor_type': print_tensor_type, + 'print_tensor_shape': print_tensor_shape, + 'print_tensor_layout': print_tensor_layout, + 'print_tensor_lod': print_tensor_lod, + 'print_phase': print_phase.upper(), + }, + ) + return output diff --git a/test/contrib/test_multi_precision_fp16_train.py b/test/contrib/test_multi_precision_fp16_train.py index a364d2161ebe4..218cfcd542da4 100644 --- a/test/contrib/test_multi_precision_fp16_train.py +++ b/test/contrib/test_multi_precision_fp16_train.py @@ -295,12 +295,10 @@ def decorate_with_data_loader(self): one_var = paddle.tensor.fill_constant( shape=[1], dtype='int64', value=1 ) - with fluid.layers.control_flow.Switch() as switch: - with switch.case(label != zero_var): - paddle.assign(zero_var, output=label) - with switch.default(): - paddle.assign(one_var, output=label) - + label_val = paddle.static.nn.cond( + label != zero_var, lambda: zero_var, lambda: one_var + ) + paddle.assign(label_val, output=label) net = resnet_cifar10(image) logits = paddle.static.nn.fc( x=net, size=10, activation="softmax" diff --git a/test/ipu/test_print_op_ipu.py b/test/ipu/test_print_op_ipu.py index 358866ee0a812..10449cd48ae83 100644 --- a/test/ipu/test_print_op_ipu.py +++ b/test/ipu/test_print_op_ipu.py @@ -55,7 +55,7 @@ def build_model(self): dtype=self.feed_dtype[0], ) out = paddle.static.nn.conv2d(x, num_filters=3, filter_size=3) - out = paddle.fluid.layers.Print(out, **self.attrs) + out = paddle.static.Print(out, **self.attrs) if self.is_training: loss = paddle.mean(out) From 6f684bd2ee5de97053292a7bf648419273e671c3 Mon Sep 17 00:00:00 2001 From: Shaojie WANG Date: Tue, 25 Apr 2023 00:03:56 -0700 Subject: [PATCH 049/405] fix shared memory over usage in embedding grad kernel on deterministic mode (#53247) * fix shared memory over usage in embedding grad kernel on determistic mode * use IdT as interger dtype --- .../phi/kernels/gpu/embedding_grad_kernel.cu | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu 
b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index cb34f5844b0d9..4771dd15dd296 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -18,6 +18,7 @@ #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" @@ -74,16 +75,14 @@ __global__ void EmbeddingGrad(T* table, } template -__global__ void EmbeddingGradDeterministic(T* table, - const T* output, - const IdT* ids, - const int64_t K, - const int64_t D) { +__global__ void EmbeddingGradDeterministic( + T* table, const T* output, const IdT* ids, const IdT K, const IdT D) { + using MT = typename dtype::MPTypeTrait::Type; extern __shared__ char buf[]; - T* smem = reinterpret_cast(buf); - T* my_s = smem + WARP_SIZE * threadIdx.y; - int64_t* indices_batch = - reinterpret_cast(buf + sizeof(T) * WARP_SIZE * BLOCKDIMY); + MT* smem = reinterpret_cast(buf); + MT* my_s = smem + WARP_SIZE * threadIdx.y; + IdT* indices_batch = + reinterpret_cast(buf + sizeof(MT) * WARP_SIZE * BLOCKDIMY); const int stride = static_cast(D); @@ -97,10 +96,10 @@ __global__ void EmbeddingGradDeterministic(T* table, batch_start += WARP_SIZE * BLOCKDIMY) { int tid = threadIdx.x + threadIdx.y * WARP_SIZE; if (batch_start + tid < K) - indices_batch[tid] = static_cast(ids[batch_start + tid]); + indices_batch[tid] = static_cast(ids[batch_start + tid]); int batch_end = - min(static_cast(batch_start + WARP_SIZE * BLOCKDIMY), K); + min(static_cast(batch_start + WARP_SIZE * BLOCKDIMY), K); // Loop over the batch of <= 1024 loaded indices in chunks of BLOCKDIMY for (int chunk_start = batch_start; chunk_start < batch_end; @@ -112,10 +111,10 @@ __global__ void EmbeddingGradDeterministic(T* table, int n_this_chunk = min(batch_end - chunk_start, BLOCKDIMY); - int64_t src_row = static_cast(chunk_start + threadIdx.y); - int64_t dst_row = indices_batch[src_row - batch_start]; + IdT src_row = static_cast(chunk_start + threadIdx.y); + IdT dst_row = indices_batch[src_row - batch_start]; if (src_row < K && feature < stride) - my_s[threadIdx.x] = static_cast(output[src_row * D + feature]); + my_s[threadIdx.x] = static_cast(output[src_row * D + feature]); __syncthreads(); @@ -202,11 +201,12 @@ struct EmbeddingGradCUDAFunctor { if (FLAGS_embedding_deterministic) { dim3 threads(WARP_SIZE, BLOCKDIMY); dim3 grids(static_cast((D + WARP_SIZE - 1) / WARP_SIZE)); + using MT = typename dtype::MPTypeTrait::Type; EmbeddingGradDeterministic <<>>(d_table, d_output, ids, K, D); } else { const int gridx = 2 * dev_ctx_.GetSMCount(); From d7a5e900fab07060a542b32fad887c3f0422f493 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Tue, 25 Apr 2023 15:05:48 +0800 Subject: [PATCH 050/405] =?UTF-8?q?=20=E3=80=90Hackathon=20No.61=E3=80=91m?= =?UTF-8?q?in=20=E7=AE=97=E5=AD=90FP16/BF16=E5=8D=95=E6=B5=8B=E5=AE=8C?= =?UTF-8?q?=E5=96=84=20(#52887)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../phi/kernels/gpu/reduce_min_grad_kernel.cu | 61 ++++++++++++++++++- paddle/phi/kernels/kps/reduce_min_kernel.cu | 12 +++- paddle/phi/kernels/reduce_min_kernel.cc | 15 ++++- .../fluid/tests/unittests/test_reduce_op.py | 45 ++++++++++++++ python/paddle/tensor/math.py | 5 +- 5 files changed, 132 insertions(+), 6 deletions(-) diff --git 
a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu index ea1d377c45976..86cccc5e03b1c 100644 --- a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu @@ -16,8 +16,63 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + dev_ctx.Alloc(x_grad, x.dtype()); + reduce_all = recompute_reduce_all(x, dims, reduce_all); + + // get reduce_dim + int dim_size = x.dims().size(); + auto reduce_dims = + funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); + auto update_dims = vectorize(x.dims()); + for (auto i : reduce_dims) { + update_dims[i] = 1; + } + + // make new tensor of out and out_grad + phi::DenseTensor new_out(out.type()); + new_out.ShareDataWith(out); + new_out.Resize(phi::make_ddim(update_dims)); + + phi::DenseTensor new_out_grad(out_grad.type()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(x.dims()); + dev_ctx.template Alloc(equal_out); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_out, &x}; + std::vector equal_outputs = {equal_out}; + funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor()); + + // 2. 
dx = dout * 1 + std::vector mul_inputs = {&new_out_grad, equal_out}; + std::vector mul_outputs = {x_grad}; + funcs::BroadcastKernel( + dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor()); + delete equal_out; +} +} // namespace phi PD_REGISTER_KERNEL(min_grad, GPU, ALL_LAYOUT, @@ -25,4 +80,6 @@ PD_REGISTER_KERNEL(min_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/kps/reduce_min_kernel.cu b/paddle/phi/kernels/kps/reduce_min_kernel.cu index 450fee16b4ca9..8ed9ec30c1920 100644 --- a/paddle/phi/kernels/kps/reduce_min_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_min_kernel.cu @@ -36,6 +36,14 @@ void MinRawKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float) {} #else -PD_REGISTER_KERNEL( - min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(min_raw, + KPS, + ALL_LAYOUT, + phi::MinRawKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} #endif diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index c4c58c8342e60..ff50e9d1077b0 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -39,7 +39,20 @@ void MinKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(min, + GPU, + ALL_LAYOUT, + phi::MinKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif + +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} #endif diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 050879369244d..631b760a7b8da 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -418,6 +418,51 @@ def test_check_output(self): self.check_output() +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework." 
+) +class TestMinFP16Op(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.python_api = paddle.min + self.public_python_api = paddle.min + self.init_dtype() + if self.dtype == np.uint16: + x = np.random.random((5, 6, 10)).astype(np.float32) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.random((5, 6, 10)).astype(self.dtype) + self.inputs = {'X': x} + self.attrs = {'dim': [2], 'keep_dim': True} + out = x.min(axis=tuple(self.attrs['dim']), keepdims=True) + if self.dtype == np.uint16: + self.outputs = {'Out': convert_float_to_uint16(out)} + else: + self.outputs = {'Out': out} + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output() + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestMinBF16Op(TestMinFP16Op): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + def raw_reduce_prod(x, dim=[0], keep_dim=False): return paddle.prod(x, dim, keep_dim) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 6b07f57e33a17..9f5212abf9147 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2456,7 +2456,10 @@ def min(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) helper = LayerHelper('min', **locals()) check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64'], 'min' + x, + 'x', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + 'min', ) out = helper.create_variable_for_type_inference(dtype=x.dtype) From eb677102725e0a653e1f3d291f35162fd9b18f1d Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Tue, 25 Apr 2023 15:19:26 +0800 Subject: [PATCH 051/405] =?UTF-8?q?=E3=80=90Hackathon=20No57=E3=80=91add?= =?UTF-8?q?=5Fbf16=5Ffp16=20unittest=20for=20conv3d=20&=20conv3d=5Ftranspo?= =?UTF-8?q?se=20(#52195)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add test+conv3d_transpose_part2 * fix some merge error * fix codestyle * fix typo * fix codestyle * fix some error * add redef float2uint * fix conv3d and conv3d_transpose --- .../fluid/tests/unittests/test_conv3d_op.py | 128 +++++++++++++-- .../unittests/test_conv3d_transpose_op.py | 147 +++++++++++++++++- .../test_conv3d_transpose_part2_op.py | 22 ++- 3 files changed, 277 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 0b843663827c1..d2b6545a81a4c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -15,10 +15,16 @@ import unittest import numpy as np -from eager_op_test import OpTest, paddle_static_guard +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + get_numeric_gradient, + paddle_static_guard, +) import paddle from paddle.fluid import core +from paddle.fluid.tests.unittests.testsuite import create_op def conv3d_forward_naive( @@ -179,6 +185,77 @@ def init_kernel_type(self): globals()[cls_name] = TestCUDNNCase +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is 
not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DCUDNNBF16(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Output'] + ) + + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(not self.use_mkldnn) + ) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Input') + + self.check_grad_with_place( + place, + ['Input'], + 'Output', + no_grad_set={'Filter'}, + check_dygraph=(not self.use_mkldnn), + user_defined_grads=[numeric_grads], + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Filter') + + self.check_grad_with_place( + place, + ['Filter'], + 'Output', + no_grad_set={'Input'}, + check_dygraph=(not self.use_mkldnn), + user_defined_grads=[numeric_grads], + ) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_input_grads = self.get_numeric_grad(place, 'Input') + numeric_fliter_grads = self.get_numeric_grad(place, 'Filter') + + self.check_grad_with_place( + place, + {'Input', 'Filter'}, + 'Output', + user_defined_grads=[numeric_input_grads, numeric_fliter_grads], + check_dygraph=(not self.use_mkldnn), + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DCUDNNBF16 + + def create_test_padding_SAME_class(parent): class TestPaddingSMAECase(parent): def init_paddings(self): @@ -323,19 +400,37 @@ def setUp(self): 'dilations': self.dilations, } - input = np.random.random(self.input_size).astype(self.dtype) - filter = np.random.random(self.filter_size).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + output = conv3d_forward_naive( input, filter, self.groups, conv3d_param, - ).astype(self.dtype) + ) + + if self.is_bfloat16_op(): + output = convert_float_to_uint16(output) + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter), + } + self.inputs_fp32 = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), + } + else: + output = output.astype(self.dtype) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), + } - self.inputs = { - 'Input': OpTest.np_dtype_to_fluid_dtype(input), - 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), - } self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -358,8 +453,6 @@ def test_check_output(self): ) def test_check_grad(self): - if self.dtype == np.float16: - return place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -371,8 +464,7 @@ def test_check_grad(self): ) def test_check_grad_no_filter(self): - if self.dtype == np.float16: - return + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # 
TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -385,8 +477,7 @@ def test_check_grad_no_filter(self): ) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -617,6 +708,14 @@ def init_kernel_type(self): self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 +# ----------------Conv3DCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DOp) +create_test_cudnn_bf16_class(TestWithGroup1) +create_test_cudnn_bf16_class(TestWithGroup2) +create_test_cudnn_bf16_class(TestWith1x1) +create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1) + + # ---- test asymmetric padding ---- @@ -1114,4 +1213,5 @@ def run_8(): if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index 22e22d9b2f66a..c0814754cc231 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -19,11 +19,25 @@ import paddle paddle.enable_static() -from eager_op_test import OpTest +from eager_op_test import OpTest, copy_bits_from_float_to_uint16 from paddle.fluid import core +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 4, 1, 2, 3]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 4, 1]) + return new_output + + def conv3dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: @@ -134,6 +148,86 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): return out +def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNFP16OP") + TestConv3DTransposeCUDNNFP16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNFP16 + + +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def 
test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + {'Input', 'Filter'}, + 'Output', + ) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['Input'], + 'Output', + no_grad_set={'Filter'}, + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['Filter'], + 'Output', + no_grad_set={'Input'}, + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DTransposeCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNBF16 + + def conv3d_transpose_wrapper( x, weight, @@ -172,12 +266,16 @@ def setUp(self): self.pad = [0, 0, 0] self.padding_algorithm = "EXPLICIT" self.init_op_type() + self.init_kernel_type() self.init_test_case() - input_ = np.random.random(self.input_size).astype("float32") - filter_ = np.random.random(self.filter_size).astype("float32") + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) - self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -189,9 +287,21 @@ def setUp(self): } output = conv3dtranspose_forward_naive( - input_, filter_, self.attrs + input, filter, self.attrs ).astype("float32") + if self.is_bfloat16_op(): + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter), + } + else: + self.inputs = { + 'Input': input, + 'Filter': filter, + } + output = output.astype(self.dtype) + self.outputs = {'Output': output} def test_check_output(self): @@ -264,6 +374,9 @@ def init_op_type(self): self.op_type = "conv3d_transpose" self.python_api = conv3d_transpose_wrapper + def init_kernel_type(self): + self.dtype = np.float32 + class TestWithSymmetricPad(TestConv3DTransposeOp): def init_test_case(self): @@ -596,6 +709,30 @@ def init_op_type(self): self.python_api = conv3d_transpose_wrapper +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestConv3DTransposeOp) +create_test_cudnn_fp16_class(TestWithSymmetricPad) +create_test_cudnn_fp16_class(TestWithAsymmetricPad) +create_test_cudnn_fp16_class(TestWithSAMEPad) +create_test_cudnn_fp16_class(TestWithVALIDPad) +create_test_cudnn_fp16_class(TestWithStride) +create_test_cudnn_fp16_class(TestWithGroups) +create_test_cudnn_fp16_class(TestWithDilation) +create_test_cudnn_fp16_class(Test_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DTransposeOp) +create_test_cudnn_bf16_class(TestWithSymmetricPad) +create_test_cudnn_bf16_class(TestWithAsymmetricPad) +create_test_cudnn_bf16_class(TestWithSAMEPad) +create_test_cudnn_bf16_class(TestWithVALIDPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroups) +create_test_cudnn_bf16_class(TestWithDilation) +create_test_cudnn_bf16_class(Test_NHWC) + + class TestConv3dTranspose(unittest.TestCase): def error_weight_input(self): array = np.array([1], dtype=np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py index 
b51cdd9b1087a..54d31a7bbd22b 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from test_conv3d_transpose_op import TestConv3DTransposeOp +from test_conv3d_transpose_op import ( + TestConv3DTransposeOp, + create_test_cudnn_bf16_class, + create_test_cudnn_fp16_class, +) import paddle from paddle import fluid @@ -84,6 +88,22 @@ def init_test_case(self): self.data_format = 'NHWC' +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithGroups_NHWC) +create_test_cudnn_fp16_class(TestWithStride_NHWC) +create_test_cudnn_fp16_class(TestWithDilation_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithGroups_NHWC) +create_test_cudnn_bf16_class(TestWithStride_NHWC) +create_test_cudnn_bf16_class(TestWithDilation_NHWC) + + class TestConv3DTransposeAPI(unittest.TestCase): def test_case1(self): data1 = paddle.static.data( From 8d4b64e8f537e4438eadb601a3a01835999d26ed Mon Sep 17 00:00:00 2001 From: Chitsing KUI Date: Tue, 25 Apr 2023 15:34:04 +0800 Subject: [PATCH 052/405] [DEBUG] print modifed flags (#53243) * print modifed flags * fix ref, opt print * fix default getter * fix ut --- .../pybind/global_value_getter_setter.cc | 89 ++++++++++++++----- python/paddle/distributed/parallel.py | 31 ++++++- 2 files changed, 98 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index e6935b577d737..94e3ca1ba41bd 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -64,6 +64,11 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { return [&]() -> py::object { return py::cast(var); }; } + template + static Getter CreateDefaultValueGetter(const T &var) { + return [=]() -> py::object { return py::cast(var); }; + } + template static Setter CreateSetter(T *var) { return [var](const py::object &obj) { *var = py::cast(obj); }; @@ -71,14 +76,23 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { private: struct VarInfo { - VarInfo(bool is_public, const Getter &getter) - : is_public(is_public), getter(getter) {} - - VarInfo(bool is_public, const Getter &getter, const Setter &setter) - : is_public(is_public), getter(getter), setter(setter) {} + VarInfo(bool is_public, const Getter &getter, const Getter &default_getter) + : is_public(is_public), + getter(getter), + default_getter(default_getter) {} + + VarInfo(bool is_public, + const Getter &getter, + const Getter &default_getter, + const Setter &setter) + : is_public(is_public), + getter(getter), + default_getter(default_getter), + setter(setter) {} const bool is_public; const Getter getter; + const Getter default_getter; const Setter setter; }; @@ -87,7 +101,10 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { static GlobalVarGetterSetterRegistry *MutableInstance() { return &instance_; } - void Register(const std::string &name, bool is_public, const Getter &getter) { + void Register(const std::string &name, + bool is_public, + const Getter &getter, + const Getter &default_getter) { PADDLE_ENFORCE_EQ( 
HasGetterMethod(name), false, @@ -96,12 +113,13 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { PADDLE_ENFORCE_NOT_NULL(getter, platform::errors::InvalidArgument( "Getter of %s should not be null", name)); - var_infos_.insert({name, VarInfo(is_public, getter)}); + var_infos_.insert({name, VarInfo(is_public, getter, default_getter)}); } void Register(const std::string &name, bool is_public, const Getter &getter, + const Getter &default_getter, const Setter &setter) { PADDLE_ENFORCE_EQ( HasGetterMethod(name), @@ -122,7 +140,8 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { PADDLE_ENFORCE_NOT_NULL(setter, platform::errors::InvalidArgument( "Setter of %s should not be null", name)); - var_infos_.insert({name, VarInfo(is_public, getter, setter)}); + var_infos_.insert( + {name, VarInfo(is_public, getter, default_getter, setter)}); } const Getter &GetterMethod(const std::string &name) const { @@ -133,6 +152,14 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { return var_infos_.at(name).getter; } + const Getter &DefaultGetterMethod(const std::string &name) const { + PADDLE_ENFORCE_EQ( + HasGetterMethod(name), + true, + platform::errors::NotFound("Cannot find global variable %s", name)); + return var_infos_.at(name).default_getter; + } + py::object GetOrReturnDefaultValue(const std::string &name, const py::object &default_value) const { if (HasGetterMethod(name)) { @@ -142,6 +169,14 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry { } } + py::object GetDefaultValue(const std::string &name) const { + if (HasGetterMethod(name)) { + return DefaultGetterMethod(name)(); + } else { + return py::cast(Py_None); + } + } + py::object Get(const std::string &name) const { return GetterMethod(name)(); } const Setter &SetterMethod(const std::string &name) const { @@ -198,6 +233,9 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { .def("__contains__", &GlobalVarGetterSetterRegistry::HasGetterMethod) .def("keys", &GlobalVarGetterSetterRegistry::Keys) .def("is_public", &GlobalVarGetterSetterRegistry::IsPublic) + .def("get_default", + &GlobalVarGetterSetterRegistry::GetDefaultValue, + py::arg("key")) .def("get", &GlobalVarGetterSetterRegistry::GetOrReturnDefaultValue, py::arg("key"), @@ -209,13 +247,15 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { } /* Public vars are designed to be writable. 
*/ -#define REGISTER_PUBLIC_GLOBAL_VAR(var) \ - do { \ - auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); \ - instance->Register(#var, \ - /*is_public=*/true, \ - GlobalVarGetterSetterRegistry::CreateGetter(var), \ - GlobalVarGetterSetterRegistry::CreateSetter(&var)); \ +#define REGISTER_PUBLIC_GLOBAL_VAR(var) \ + do { \ + auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); \ + instance->Register( \ + #var, \ + /*is_public=*/true, \ + GlobalVarGetterSetterRegistry::CreateGetter(var), \ + GlobalVarGetterSetterRegistry::CreateDefaultValueGetter(var), \ + GlobalVarGetterSetterRegistry::CreateSetter(&var)); \ } while (0) struct RegisterGetterSetterVisitor { @@ -225,18 +265,25 @@ struct RegisterGetterSetterVisitor { : name_(name), is_writable_(is_writable), value_ptr_(value_ptr) {} template - void operator()(const T &) const { + void operator()(const T &default_value) const { auto &value = *static_cast(value_ptr_); auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); bool is_public = is_writable_; // currently, all writable vars are public if (is_writable_) { - instance->Register(name_, - is_public, - GlobalVarGetterSetterRegistry::CreateGetter(value), - GlobalVarGetterSetterRegistry::CreateSetter(&value)); + instance->Register( + name_, + is_public, + GlobalVarGetterSetterRegistry::CreateGetter(value), + GlobalVarGetterSetterRegistry::CreateDefaultValueGetter( + default_value), + GlobalVarGetterSetterRegistry::CreateSetter(&value)); } else { instance->Register( - name_, is_public, GlobalVarGetterSetterRegistry::CreateGetter(value)); + name_, + is_public, + GlobalVarGetterSetterRegistry::CreateGetter(value), + GlobalVarGetterSetterRegistry::CreateDefaultValueGetter( + default_value)); } } diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index c016f9d743c7f..3fdf7cdcdd954 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -14,9 +14,10 @@ import itertools import os +import sys import time import warnings -from collections import OrderedDict +from collections import OrderedDict, namedtuple from contextlib import contextmanager from multiprocessing import Manager # noqa: F401 from multiprocessing import Process # noqa: F401 @@ -905,6 +906,31 @@ def _check_var_exists(var_name): ) +def _get_modified_flags(): + ret = [] + FLAGS = namedtuple('FLAGS', ['name', 'current_value', 'default_value']) + global_flags = core.globals() + for key in global_flags.keys(): + value = global_flags.get(key) + default_value = global_flags.get_default(key) + if not value == default_value: + ret.append(FLAGS(key, value, default_value)) + return ret + + +def _print_modified_flags(modified_flags): + if len(modified_flags) > 0: + sys.stderr.write( + "======================= Modified FLAGS detected =======================\n" + ) + for flag in modified_flags: + sys.stderr.write(str(flag)) + sys.stderr.write("\n") + sys.stderr.write( + "=======================================================================\n" + ) + + def init_parallel_env(): """ @@ -967,6 +993,9 @@ def train(): """ + modified_flags = _get_modified_flags() + _print_modified_flags(modified_flags) + # 0. get env & check world size global _global_parallel_env # when call init_parallel_env, need update `_global_parallel_env` From 1ddf93961f57be866b1136fa21793f783a559243 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 25 Apr 2023 09:33:38 +0800 Subject: [PATCH 053/405] Simplify codes and fix backward. 
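
This commit folds the duplicated cu_seq_q/cu_seq_k setup, workspace allocation, and dropout seed bookkeeping in FlashAttnWithGating into small private helpers (AllocAndInitSeqQK, CreateWorkspace, GenerateSeedOffsetPair) shared by the forward and backward paths, and trims the backward interface down to the tensors it actually reads. As a minimal host-side sketch of the cumulative sequence offsets that AllocAndInitSeqQK fills on the GPU (assuming the kernel's start = 0, step = seq_len_r arguments and int32 offsets; the name CuSeqReference below is illustrative and not part of the patch):

    // Reference for the cu_seq_q / cu_seq_k contents built by AllocAndInitSeqQK.
    #include <cstdint>
    #include <vector>

    std::vector<int32_t> CuSeqReference(int64_t seq_batch_size, int64_t step) {
      // seq_batch_size = batch_size * seq_len_m, step = seq_len_r.
      std::vector<int32_t> cu_seq(seq_batch_size + 1);
      for (int64_t i = 0; i <= seq_batch_size; ++i) {
        cu_seq[i] = static_cast<int32_t>(i * step);  // 0, step, 2*step, ...
      }
      return cu_seq;  // cu_seq_q and cu_seq_k share this layout here
    }

With q, k and v packed back-to-back in qkv_transpose_out, these offsets are what the flash-attention kernels invoked through phi::dynload use to slice each (batch, seq_len_m) group of seq_len_r rows.
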
--- .../forwards/fused_gate_attention_fwd_func.cc | 14 +- .../nodes/fused_gate_attention_node.cc | 12 ++ .../api/manual/fluid_manual/nodes/nodes.h | 10 + .../operators/fused/fused_gate_attention.h | 186 +++++++----------- .../fused/fused_gate_attention_op.cc | 1 - .../fused/fused_gate_attention_op.cu | 15 +- 6 files changed, 114 insertions(+), 124 deletions(-) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc index b0585bc7acd54..546b60438fedc 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc @@ -314,10 +314,10 @@ fused_gate_attention_dygraph_function( has_gating = PADDLE_GET_CONST(bool, attrs.at("has_gating")); } - // bool use_flash_attn = false; - // if (attrs.count("use_flash_attn")) { - // use_flash_attn = PADDLE_GET_CONST(bool, attrs.at("use_flash_attn")); - // } + bool use_flash_attn = false; + if (attrs.count("use_flash_attn")) { + use_flash_attn = PADDLE_GET_CONST(bool, attrs.at("use_flash_attn")); + } // Set Attributes grad_node->SetAttrMap(std::move(attrs)); @@ -365,6 +365,12 @@ fused_gate_attention_dygraph_function( grad_node->SetGradOutMeta(NonbatchedBias, 6); } + if (use_flash_attn) { + grad_node->SetTensorWrapperSoftmaxLse(SoftmaxLse); + grad_node->SetTensorWrapperSrcMask(SrcMask); + grad_node->SetGradOutMeta(SrcMask, 7); + } + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QueryTransposeOut, 0); grad_node->SetGradInMeta(QueryTransposeOut, 0); egr::EagerUtils::SetOutRankWithSlot(p_autograd_KeyTransposeOut, 1); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc index 8c427eba8cd0a..3692a20faed98 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc @@ -45,6 +45,11 @@ fused_gate_attentionGradNodeCompat::operator()( has_gating = PADDLE_GET_CONST(bool, attr_map_.at("has_gating")); } + bool use_flash_attn = false; + if (attr_map_.count("use_flash_attn")) { + use_flash_attn = PADDLE_GET_CONST(bool, attr_map_.at("use_flash_attn")); + } + std::map>> ins0 = {{"FMHAOut", egr::EagerUtils::TrySyncToVars( @@ -168,6 +173,13 @@ fused_gate_attentionGradNodeCompat::operator()( egr::Controller::Instance().GenerateUniqueName())}; } + if (use_flash_attn) { + auto SrcMask = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMask_); + ins0["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask); + auto SoftmaxLse = egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxLse_); + ins0["SoftmaxLse"] = egr::EagerUtils::TrySyncToVars(SoftmaxLse); + } + auto& attrs_map0 = this->attr_map_; // Pass the entire attribute map to TraceOp // The underlying kernel will pickup whatever attribute they need at runtime diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h index b0576672ae18e..212f9d9f1da19 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -61,12 +61,14 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { GateOut_.clear(); GateWeight_.clear(); NonbatchedBias_.clear(); + SrcMask_.clear(); OutLinearBias_.clear(); 
OutLinearWeight_.clear(); QKVTransposeOut_.clear(); QKVWeight_.clear(); Query_.clear(); SoftmaxOut_.clear(); + SoftmaxLse_.clear(); Key_.clear(); QueryWeight_.clear(); KeyWeight_.clear(); @@ -103,6 +105,9 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { void SetTensorWrapperNonbatchedBias(const paddle::Tensor& NonbatchedBias) { NonbatchedBias_ = egr::TensorWrapper(NonbatchedBias, false); } + void SetTensorWrapperSrcMask(const paddle::Tensor& SrcMask) { + SrcMask_ = egr::TensorWrapper(SrcMask, false); + } void SetTensorWrapperOutLinearBias(const paddle::Tensor& OutLinearBias) { OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false); } @@ -121,6 +126,9 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { void SetTensorWrapperSoftmaxOut(const paddle::Tensor& SoftmaxOut) { SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false); } + void SetTensorWrapperSoftmaxLse(const paddle::Tensor& SoftmaxLse) { + SoftmaxLse_ = egr::TensorWrapper(SoftmaxLse, false); + } void SetTensorWrapperKey(const paddle::Tensor& Key) { Key_ = egr::TensorWrapper(Key, false); } @@ -160,12 +168,14 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { egr::TensorWrapper GateOut_; egr::TensorWrapper GateWeight_; egr::TensorWrapper NonbatchedBias_; + egr::TensorWrapper SrcMask_; egr::TensorWrapper OutLinearBias_; egr::TensorWrapper OutLinearWeight_; egr::TensorWrapper QKVTransposeOut_; egr::TensorWrapper QKVWeight_; egr::TensorWrapper Query_; egr::TensorWrapper SoftmaxOut_; + egr::TensorWrapper SoftmaxLse_; egr::TensorWrapper Key_; egr::TensorWrapper QueryWeight_; diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 07b4b5973cac8..3579caf1bc99d 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -951,10 +951,6 @@ class FlashAttnWithGating { phi::DenseTensor* fmha_out, phi::DenseTensor* gate_out, GateAttentionConfig* config) { - T* q_ptr = nullptr; - T* k_ptr = nullptr; - T* v_ptr = nullptr; - bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; @@ -992,29 +988,14 @@ class FlashAttnWithGating { // q_size == k_size int64_t q_size = config->GetQuerySize(); - q_ptr = qkv_transpose_out->data(); - k_ptr = q_ptr + q_size; - v_ptr = k_ptr + q_size; + T* q_ptr = qkv_transpose_out->data(); + T* k_ptr = q_ptr + q_size; + T* v_ptr = k_ptr + q_size; // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t end_size = seq_batch_size + 1; - int64_t seq_size = 0; - int64_t start = 0; - int64_t end = end_size; - int64_t step = static_cast(config->seq_len_r); - phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({end_size}); - cu_seq_k.Resize({end_size}); - AllocWithDebugInfo(dev_ctx_, "cu_seq_q", &cu_seq_q); - AllocWithDebugInfo(dev_ctx_, "cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (seq_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, cu_seq_q.data(), cu_seq_k.data()); - VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start - << ", step = " << step << ", end = " << end; - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "2: Init cu_seq_q and cu_seq_k"; + int64_t step = static_cast(config->seq_len_r); + AllocAndInitSeqQK(seq_batch_size, step, &cu_seq_q, &cu_seq_k); // 3. Dealing with mask and bias for flash_attn. 
phi::DenseTensor temp_mask, temp_bias; @@ -1071,9 +1052,7 @@ class FlashAttnWithGating { << "]"; // 6. construct random seed - auto gen = dev_ctx_.GetGenerator(); - uint64_t inc = batch_size_ * num_heads_ * 32; - auto seed_offset_pair = gen->IncrementOffset(inc); + auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; @@ -1124,13 +1103,7 @@ class FlashAttnWithGating { LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "7: Get workspace_size=" << workspace_size; - phi::DenseTensor workspace; - if (workspace_size > 0) { - workspace = phi::Empty( - dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "Allocate workspace"; + phi::DenseTensor workspace = CreateWorkspace(workspace_size); LOG(INFO) << "qkv_transpose_out: " << TensorDebugString(qkv_transpose_out); LOG(INFO) << "src_mask: " << TensorDebugString(src_mask); @@ -1179,27 +1152,15 @@ class FlashAttnWithGating { } } - void ComputeBackward(const phi::DenseTensor* q_transpose_out, - const phi::DenseTensor* k_transpose_out, - const phi::DenseTensor* v_transpose_out, - const phi::DenseTensor* qkv_transpose_out, + void ComputeBackward(const phi::DenseTensor* qkv_transpose_out, + const phi::DenseTensor* src_mask, + const phi::DenseTensor* nonbatched_bias, + const phi::DenseTensor* softmax_lse, + const phi::DenseTensor* fmha_out, const phi::DenseTensor* fmha_out_grad, phi::DenseTensor* src_mask_grad, phi::DenseTensor* nonbatched_bias_grad, - GateAttentionGradConfig* config, - const phi::DenseTensor* fmha_out = nullptr, - const phi::DenseTensor* softmax_lse = nullptr, - const phi::DenseTensor* nonbatched_bias = nullptr, - const phi::DenseTensor* src_mask = nullptr) { - T* q_grad_ptr = nullptr; - T* k_grad_ptr = nullptr; - T* v_grad_ptr = nullptr; - - phi::DenseTensor q_transpose_out_grad; - phi::DenseTensor k_transpose_out_grad; - phi::DenseTensor v_transpose_out_grad; - phi::DenseTensor qkv_transpose_out_grad; - + GateAttentionGradConfig* config) { bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; @@ -1215,38 +1176,29 @@ class FlashAttnWithGating { qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be" "nullptr when merge_qkv is true.")); + int64_t q_size = config->GetQuerySize(); const T* q_ptr = qkv_transpose_out->data(); const T* k_ptr = q_ptr + q_size; const T* v_ptr = k_ptr + q_size; + phi::DenseTensor qkv_transpose_out_grad; qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); AllocWithDebugInfo( dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad); + T* q_grad_ptr = qkv_transpose_out_grad.data(); + T* k_grad_ptr = q_grad_ptr + q_size; + T* v_grad_ptr = k_grad_ptr + q_size; + int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - // 2. Dealing with cu_seq_q and cu_seq_k for flash_attn. + // 2. Init with cu_seq_q and cu_seq_k for flash_attn. 
phi::DenseTensor cu_seq_q, cu_seq_k; - int64_t start = 0; - int64_t step = static_cast(config->seq_len_r); - int64_t end_size = (seq_batch_size + 1); - int64_t end = end_size; - int64_t seq_size = 0; - phi::funcs::GetSize(start, end, step, &seq_size); - cu_seq_q.Resize({end_size}); - cu_seq_k.Resize({end_size}); - AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_q", &cu_seq_q); - AllocWithDebugInfo(dev_ctx_, "Grad: cu_seq_k", &cu_seq_k); - int64_t block = std::min(seq_size, static_cast(256)); - int64_t grid = (seq_size + block - 1) / block; - FlashAttRange<<>>( - start, step, end, cu_seq_q.data(), cu_seq_k.data()); - VLOG(4) << "[Flash_attn] cu_seq_len : start = " << start - << ", step = " << step << ", end = " << end; - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + int64_t step = static_cast(config->seq_len_r); + AllocAndInitSeqQK(seq_batch_size, step, &cu_seq_q, &cu_seq_k); // 3. Dealing with mask and bias for flash_attn. phi::DenseTensor temp_mask, temp_bias; @@ -1308,27 +1260,17 @@ class FlashAttnWithGating { } LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - q_ptr = q_transpose_out->data(); - k_ptr = k_transpose_out->data(); - v_ptr = v_transpose_out->data(); - q_transpose_out_grad.Resize(config->q_transpose_out_dims); - k_transpose_out_grad.Resize(config->kv_transpose_out_dims); - v_transpose_out_grad.Resize(config->kv_transpose_out_dims); - - q_grad_ptr = dev_ctx_.Alloc(&q_transpose_out_grad, - q_transpose_out_grad.numel() * sizeof(T)); - k_grad_ptr = dev_ctx_.Alloc(&k_transpose_out_grad, - k_transpose_out_grad.numel() * sizeof(T)); - v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, - v_transpose_out_grad.numel() * sizeof(T)); - // 6. construct random seed - auto gen = dev_ctx_.GetGenerator(); - uint64_t inc = batch_size_ * num_heads_ * 32; - auto seed_offset_pair = gen->IncrementOffset(inc); + auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; + LOG(INFO) << "fmha_out: " << TensorDebugString(fmha_out); + LOG(INFO) << "fmha_out_grad: " << TensorDebugString(fmha_out_grad); + LOG(INFO) << "softmax_lse: " << TensorDebugString(softmax_lse); + LOG(INFO) << "softmax_d: " << TensorDebugString(&softmax_d); + LOG(INFO) << "bias_d: " << TensorDebugString(&bias_d); + // 7. flas_attn part one, get temp worksapce size. 
uint64_t workspace_size; float p_dropout = 0.f; @@ -1376,15 +1318,7 @@ class FlashAttnWithGating { } LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - phi::DenseTensor workspace; - printf("workspace_size = %d\n", workspace_size); - if (workspace_size > 0) { - workspace = phi::Empty( - dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); - } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - + phi::DenseTensor workspace = CreateWorkspace(workspace_size); succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), @@ -1447,23 +1381,55 @@ class FlashAttnWithGating { {0}); } - if (merge_qkv_) { - phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); - ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); - } else { - phi::DenseTensor* q_out_grad = config->GetQueryOutGrad(); - phi::DenseTensor* k_out_grad = config->GetKeyOutGrad(); - phi::DenseTensor* v_out_grad = config->GetValueOutGrad(); - ComputeQKVTransposeBackward(q_transpose_out_grad, - k_transpose_out_grad, - v_transpose_out_grad, - q_out_grad, - k_out_grad, - v_out_grad); - } + phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); + ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); } private: + void AllocAndInitSeqQK(int64_t seq_batch_size, + int64_t step, + phi::DenseTensor* cu_seq_q, + phi::DenseTensor* cu_seq_k) { + int64_t start = 0; + int64_t end_size = seq_batch_size + 1; + int64_t end = end_size; + int64_t seq_size = 0; + phi::funcs::GetSize(start, end, step, &seq_size); + + cu_seq_q->Resize({end_size}); + cu_seq_k->Resize({end_size}); + AllocWithDebugInfo(dev_ctx_, "cu_seq_q", cu_seq_q); + AllocWithDebugInfo(dev_ctx_, "cu_seq_k", cu_seq_k); + + int64_t block = std::min(seq_size, static_cast(256)); + int64_t grid = (seq_size + block - 1) / block; + FlashAttRange<<>>( + start, step, end, cu_seq_q->data(), cu_seq_k->data()); + + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "AllocAndInit cu_seq_q and cu_seq_k: start=" << start + << ", step=" << step << ", end=" << end; + } + + phi::DenseTensor CreateWorkspace(uint64_t workspace_size) { + phi::DenseTensor workspace; + if (workspace_size > 0) { + workspace = phi::Empty( + dev_ctx_, {int64_t(workspace_size / sizeof(float))}); + DBGPTR(workspace.data(), "workspace"); + } + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "Allocate workspace: workspace_size=" << workspace_size; + return workspace; + } + + std::pair GenerateSeedOffsetPair(int64_t batch_size, + int64_t num_heads) { + auto gen = dev_ctx_.GetGenerator(); + uint64_t inc = batch_size * num_heads * 32; + return gen->IncrementOffset(inc); + } + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, seq_len_r, num_heads, head_dim] void ComputeQKVTransposeForwardForFlashAttn( diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 9743bc33fd055..7175a20787bc6 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -314,7 +314,6 @@ class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("QKVTransposeOut", this->Output("QKVTransposeOut")); if (use_flash_attn) { - op->SetInput("NonbatchedBias", this->Input("NonbatchedBias")); op->SetInput("SrcMask", this->Input("SrcMask")); op->SetInput("SoftmaxLse", this->Output("SoftmaxLse")); } diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu 
b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index a0f97ce59109b..261e2e377b5f8 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -557,18 +557,15 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { const auto *softmax_lse = ctx.Input("SoftmaxLse"); auto fmha_compute = FlashAttnWithGating(dev_ctx, merge_qkv); - fmha_compute.ComputeBackward(q_transpose_out, - k_transpose_out, - v_transpose_out, - qkv_transpose_out, + fmha_compute.ComputeBackward(qkv_transpose_out, + src_mask, + non_batched_bias, + softmax_lse, + fmha_out, &fmha_out_grad, nullptr, nonbatched_bias_grad, - &config, - fmha_out, - softmax_lse, - non_batched_bias, - src_mask); + &config); } else { const auto *softmax_out = ctx.Input("SoftmaxOut"); From 22e96bde5bd2d9ca01c9911c28fb6faa617f0599 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 25 Apr 2023 15:54:44 +0800 Subject: [PATCH 054/405] [PHI]Add flags macro for PHI (#52991) * add flags for phi * fix compile bugs * fix ci bugs * fix inference bugs * fix cinn' bugs * fix cinn bugs * perfect code according comment * fix ci bugs * fix ci bugs --- .../collective/process_group_nccl.cc | 3 +- .../fluid/distributed/collective/reducer.cc | 3 +- .../distributed/fleet_executor/carrier.cc | 2 +- .../ps/service/communicator/communicator.h | 3 +- .../distributed/ps/service/heter_server.h | 3 +- .../ps/table/common_graph_table.cc | 12 +- .../distributed/ps/table/ssd_sparse_table.cc | 4 +- .../distributed/ps/table/tensor_table.cc | 3 +- .../fluid/distributed/ps/table/tensor_table.h | 3 +- .../eager_manual/forwards/add_n_fwd_func.cc | 3 +- .../forwards/conv2d_fwd_function.cc | 3 +- .../forwards/sync_batch_norm_fwd_func.cc | 5 +- .../manual/eager_manual/nodes/add_n_node.cc | 4 +- .../manual/eager_manual/nodes/conv2d_nodes.cc | 4 +- .../nodes/sync_batch_norm_node.cc | 3 +- .../generator/eager_gen.py | 7 +- paddle/fluid/eager/nan_inf_utils.cc | 5 +- paddle/fluid/framework/custom_operator.cc | 4 +- paddle/fluid/framework/data_feed.cc | 2 +- paddle/fluid/framework/data_feed.cu | 9 +- paddle/fluid/framework/data_feed.h | 9 +- paddle/fluid/framework/data_set.cc | 6 +- .../framework/details/all_reduce_op_handle.cc | 3 +- .../details/computation_op_handle.cc | 4 +- .../details/fused_all_reduce_op_handle.cc | 3 +- .../grad_merge_all_reduce_op_handle.cc | 3 +- .../framework/details/nan_inf_utils_detail.cc | 3 +- .../framework/details/nan_inf_utils_detail.cu | 3 +- .../framework/details/nan_inf_utils_detail.h | 4 +- .../fluid/framework/details/nccl_op_handle.h | 3 +- .../framework/details/reduce_op_handle.cc | 2 +- .../details/scope_buffered_monitor.cc | 3 +- .../details/sparse_all_reduce_op_handle.cc | 3 +- .../framework/dist_multi_trainer_test.cc | 2 +- paddle/fluid/framework/executor.cc | 3 +- .../cudf/concurrent_unordered_map.cuh.h | 3 +- .../framework/fleet/heter_ps/gpu_graph_node.h | 4 +- .../fleet/heter_ps/gpu_graph_utils.h | 3 +- .../fleet/heter_ps/graph_gpu_ps_table.h | 3 +- .../fleet/heter_ps/graph_gpu_wrapper.cu | 5 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 19 +- .../fleet/heter_ps/heter_resource.cc | 5 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 5 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 4 +- paddle/fluid/framework/garbage_collector.cc | 7 +- paddle/fluid/framework/hogwild_worker.cc | 3 +- .../framework/ir/coalesce_grad_tensor_pass.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 2 +- paddle/fluid/framework/ir/graph_helper.cc | 2 +- 
...est_reference_count_pass_last_lived_ops.cc | 3 +- .../new_executor/executor_statistics.cc | 2 +- .../garbage_collector/garbage_collector.cc | 5 +- .../interpreter/dependency_builder.cc | 2 +- .../interpreter/interpreter_util.cc | 5 +- .../framework/new_executor/interpretercore.cc | 7 +- paddle/fluid/framework/op_registry.h | 6 +- paddle/fluid/framework/operator.cc | 7 +- paddle/fluid/framework/operator.h | 3 +- .../framework/paddle2cinn/cinn_compiler.cc | 7 +- .../paddle2cinn/cinn_compiler_test.cc | 5 +- paddle/fluid/framework/parallel_executor.cc | 5 +- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/unused_var_check.cc | 3 +- paddle/fluid/imperative/basic_engine.cc | 3 +- paddle/fluid/imperative/flags.cc | 2 +- paddle/fluid/imperative/layer.cc | 3 +- .../fluid/imperative/partial_grad_engine.cc | 3 +- paddle/fluid/imperative/prepared_operator.cc | 5 +- paddle/fluid/imperative/prepared_operator.h | 3 +- paddle/fluid/imperative/profiler.cc | 3 +- paddle/fluid/imperative/tracer.cc | 7 +- paddle/fluid/inference/api/analysis_config.cc | 3 +- .../api/demo_ci/trt_mobilenet_demo.cc | 2 - .../fluid/inference/api/demo_ci/vis_demo.cc | 3 - paddle/fluid/inference/tensorrt/engine.h | 3 +- paddle/fluid/jit/serializer.cc | 3 +- paddle/fluid/memory/allocation/allocator.h | 3 +- .../memory/allocation/allocator_facade.cc | 6 +- .../allocator_facade_abs_flags_test.cc | 11 +- .../allocator_facade_frac_flags_test.cc | 11 +- .../memory/allocation/allocator_strategy.cc | 4 +- .../auto_growth_best_fit_allocator.cc | 2 +- ...o_growth_best_fit_allocator_facade_test.cc | 8 +- .../memory/allocation/buddy_allocator.cc | 4 +- .../memory/allocation/buddy_allocator_test.cc | 8 +- .../fluid/memory/allocation/mmap_allocator.cc | 3 +- .../allocation/naive_best_fit_allocator.cc | 9 +- .../memory/allocation/system_allocator.cc | 11 +- .../allocation/system_allocator_test.cc | 6 +- .../allocation/thread_local_allocator_test.cc | 5 +- paddle/fluid/operators/activation_op.cc | 3 +- paddle/fluid/operators/batch_norm_op.cu | 3 +- paddle/fluid/operators/cinn/cinn_launch_op.cc | 3 +- paddle/fluid/operators/cinn/cinn_launch_op.h | 6 +- .../operators/cinn/cinn_launch_op_test.cc | 10 +- .../controlflow/conditional_block_infer_op.cc | 3 +- .../controlflow/conditional_block_op.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 1 + .../operators/fused/cudnn_bn_add_relu_test.cc | 3 +- .../operators/fused/fused_bn_activation_op.cu | 3 +- .../fused/fused_bn_add_activation_op.cu | 3 +- .../operators/fused/fused_dropout_helper.h | 3 +- .../fused/fused_multi_transformer_op.cu.h | 3 +- .../pscore/heter_listen_and_serv_op.cc | 2 +- .../pscore/heter_listen_and_server_test.cc | 5 +- paddle/fluid/operators/run_program_op.h | 3 +- paddle/fluid/platform/cpu_info_test.cc | 3 +- .../platform/cuda_graph_with_memory_pool.cc | 3 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 12 +- paddle/fluid/platform/device_code.cc | 2 +- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/platform/flags.h | 77 ++++ paddle/fluid/platform/gen_comm_id_helper.cc | 4 +- paddle/fluid/platform/init.cc | 3 +- paddle/fluid/platform/place.cc | 2 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler/host_tracer.cc | 2 +- .../fluid/platform/test_limit_gpu_memory.cu | 5 +- .../utils/static/composite_grad_desc_maker.h | 2 +- paddle/fluid/prim/utils/utils.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 4 +- paddle/fluid/pybind/eager_method.cc | 3 +- paddle/fluid/pybind/parallel_executor.cc | 3 +- paddle/fluid/pybind/place.cc | 3 +- 
paddle/fluid/pybind/pybind.cc | 3 +- paddle/fluid/pybind/reader_py.cc | 3 +- paddle/fluid/pybind/tensor.cc | 5 +- paddle/phi/api/profiler/profiler.cc | 12 +- paddle/phi/api/profiler/profiler.h | 4 +- paddle/phi/backends/cpu/cpu_info.cc | 6 +- paddle/phi/backends/dynload/dynamic_loader.cc | 86 ++-- paddle/phi/backends/xpu/xpu_info.cc | 2 +- paddle/phi/core/flags.cc | 387 +++++++++--------- paddle/phi/core/flags.h | 145 ++++++- paddle/phi/kernels/funcs/jit/gen_base.cc | 2 +- paddle/phi/kernels/funcs/jit/gen_base.h | 3 +- paddle/utils/pybind.cc | 4 +- test/cpp/inference/test_helper.h | 3 +- test/cpp/prim/test_eager_prim.cc | 4 +- test/cpp/prim/test_static_prim.cc | 3 +- 140 files changed, 761 insertions(+), 503 deletions(-) create mode 100644 paddle/fluid/platform/flags.h diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc index 4653799401bbe..ed6118287fa91 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -23,9 +23,10 @@ #include "paddle/phi/core/distributed/check/nccl_dynamic_check.h" #include "paddle/phi/core/distributed/check/static_check.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/utils/data_type.h" -DECLARE_bool(nccl_blocking_wait); +PHI_DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); // set this flag to `true` and recompile to enable dynamic checks diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index defc84fbe3d9c..260da509a17cb 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -15,9 +15,10 @@ #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/core/flags.h" DECLARE_bool(use_stream_safe_cuda_allocator); -DECLARE_string(allocator_strategy); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index ba6ab7aa55e38..82d99a3835230 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -28,7 +28,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( fleet_executor_with_standalone, false, diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index 5247b9a3a3554..f3aa23a77826d 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -41,6 +41,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" @@ -52,7 +53,7 @@ struct CommContext; } // namespace distributed } // namespace paddle -DECLARE_bool(communicator_is_sgd_optimizer); +PHI_DECLARE_bool(communicator_is_sgd_optimizer); namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h index 73e51871c964e..88d37e6022ed5 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -38,6 +38,7 @@ limitations under the License. */ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/core/flags.h" namespace google { namespace protobuf { @@ -52,7 +53,7 @@ class ProgramDesc; class Scope; } // namespace framework } // namespace paddle -DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(eager_delete_tensor_gb); namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index f2b54def608af..13b44d409fad7 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -21,7 +21,6 @@ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" @@ -30,13 +29,14 @@ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/generator.h" -DECLARE_bool(graph_load_in_parallel); -DECLARE_bool(graph_get_neighbor_id); -DECLARE_int32(gpugraph_storage_mode); -DECLARE_uint64(gpugraph_slot_feasign_max_num); -DECLARE_bool(graph_metapath_split_opt); +PHI_DECLARE_bool(graph_load_in_parallel); +PHI_DECLARE_bool(graph_get_neighbor_id); +PHI_DECLARE_int32(gpugraph_storage_mode); +PHI_DECLARE_uint64(gpugraph_slot_feasign_max_num); +PHI_DECLARE_bool(graph_metapath_split_opt); namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index aa626a5e49fb5..fbc6af49870d5 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -18,8 +18,8 @@ #include "paddle/fluid/distributed/common/local_random.h" #include "paddle/fluid/distributed/common/topk_calculator.h" #include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/utils/string/string_helper.h" - DECLARE_bool(pserver_print_missed_key_num_every_push); DECLARE_bool(pserver_create_value_when_push); DECLARE_bool(pserver_enable_create_feasign_randomly); @@ -549,7 +549,6 @@ int32_t SSDSparseTable::Shrink(const std::string& param) { } int32_t SSDSparseTable::UpdateTable() { - // TODO implement with multi-thread int count = 0; for (int i = 0; i < _real_local_shard_num; ++i) { auto& shard = _local_shards[i]; @@ -578,7 +577,6 
@@ int64_t SSDSparseTable::LocalSize() { for (int i = 0; i < _real_local_shard_num; ++i) { local_size += _local_shards[i].size(); } - // TODO rocksdb size return local_size; } diff --git a/paddle/fluid/distributed/ps/table/tensor_table.cc b/paddle/fluid/distributed/ps/table/tensor_table.cc index 187c7021d0281..adda11a78f637 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.cc +++ b/paddle/fluid/distributed/ps/table/tensor_table.cc @@ -13,8 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/tensor_table.h" +#include "paddle/phi/core/flags.h" -DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(eager_delete_tensor_gb); namespace paddle { namespace distributed {} // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index 9fb29d727cf9a..0139bd2737ea9 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/flags.h" namespace paddle { namespace framework { @@ -37,7 +38,7 @@ struct ExecutorPrepareContext; } // namespace framework } // namespace paddle -DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(eager_delete_tensor_gb); namespace paddle { namespace distributed { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index 5804d37540957..d14832b80a1db 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -19,8 +19,9 @@ #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(check_nan_inf); paddle::Tensor add_n_ad_func(const std::vector& x) { // Dygraph Record Event diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 39fa77d3ae95f..b7ca5a7c26710 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -20,8 +20,9 @@ #include "paddle/fluid/eager/eager_layout_auto_tune.h" #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(check_nan_inf); paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, const paddle::Tensor& filter, diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc index 5bd9571dbf195..a5b0c4d70b07f 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc @@ -21,10 +21,11 @@ #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/api/include/sparse_api.h" +#include "paddle/phi/core/flags.h" #pragma GCC diagnostic 
ignored "-Wunused-variable" -DECLARE_bool(check_nan_inf); -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_bool(check_nan_inf); +PHI_DECLARE_string(tensor_operants_mode); std::tuple, egr::kSlotSmallVectorSize> AddNGradNodeFinal::operator()( diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc index ce06b02728719..0d9862b7b2f2d 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -26,7 +26,9 @@ #include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" #include "paddle/phi/api/include/sparse_api.h" -DECLARE_bool(check_nan_inf); +#include "paddle/phi/core/flags.h" + +PHI_DECLARE_bool(check_nan_inf); paddle::small_vector, egr::kSlotSmallVectorSize> Conv2dGradNodeFinal::operator()( diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc index 4fc847a1ecd8b..4242f74c81c08 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc @@ -25,8 +25,9 @@ #include "paddle/phi/api/backward/sparse_bw_api.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_custom_impl.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(check_nan_inf); paddle::small_vector, egr::kSlotSmallVectorSize> SyncBatchNormGradNode::operator()( diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index aa4afa965b8c1..c5db4aee73de0 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -361,7 +361,8 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/api/all.h" #include "paddle/fluid/prim/utils/utils.h" -DECLARE_bool(check_nan_inf); +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(check_nan_inf); {} """ @@ -388,8 +389,10 @@ class {} : public egr::GradNodeBase {{ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/phi/core/flags.h" + DECLARE_bool(check_nan_inf); -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); {} {} """ diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index e71ae7cf11939..c183c64fa7a1d 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -17,11 +17,12 @@ #include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/selected_rows.h" -#include "paddle/phi/core/compat/convert_utils.h" -DECLARE_int32(check_nan_inf_level); +PHI_DECLARE_int32(check_nan_inf_level); namespace egr { static std::unordered_set& nan_inf_check_op_list() { diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 18ae5627633a0..ebfed9a6f73f6 100644 --- 
a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -44,11 +44,11 @@ limitations under the License. */ #include "paddle/phi/backends/device_manager.h" #endif -#include "gflags/gflags.h" #include "paddle/phi/api/include/operants_manager.h" #include "paddle/phi/api/include/tensor_operants.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 63adcd59912aa..32c4845bd0d57 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -30,7 +30,7 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" USE_INT_STAT(STAT_total_feasign_num_in_mem); -DECLARE_bool(enable_ins_parser_file); +PHI_DECLARE_bool(enable_ins_parser_file); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index f9cf10a1aabbe..b84529066fd8c 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -31,13 +31,14 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/io/fs.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/gpu/graph_reindex_funcs.h" #include "paddle/phi/kernels/graph_reindex_kernel.h" -DECLARE_bool(enable_opt_get_features); -DECLARE_bool(graph_metapath_split_opt); -DECLARE_int32(gpugraph_storage_mode); -DECLARE_double(gpugraph_hbm_table_load_factor); +PHI_DECLARE_bool(enable_opt_get_features); +PHI_DECLARE_bool(graph_metapath_split_opt); +PHI_DECLARE_int32(gpugraph_storage_mode); +PHI_DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 415508944c947..a6cf44234773b 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -48,11 +48,12 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/cuda_stream.h" #endif +#include "paddle/phi/core/flags.h" -DECLARE_int32(record_pool_max_size); -DECLARE_int32(slotpool_thread_num); -DECLARE_bool(enable_slotpool_wait_release); -DECLARE_bool(enable_slotrecord_reset_shrink); +PHI_DECLARE_int32(record_pool_max_size); +PHI_DECLARE_int32(slotpool_thread_num); +PHI_DECLARE_bool(enable_slotpool_wait_release); +PHI_DECLARE_bool(enable_slotrecord_reset_shrink); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 5ebd480084ee5..3dc440b1d1c69 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/data_set.h" -#include "gflags/gflags.h" #include "google/protobuf/text_format.h" #if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/index_dataset/index_sampler.h" @@ -24,6 +23,7 @@ #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/timer.h" +#include "paddle/phi/core/flags.h" #ifdef PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/wrapper/fleet.h" @@ -37,8 +37,8 @@ USE_INT_STAT(STAT_total_feasign_num_in_mem); USE_INT_STAT(STAT_epoch_finish); -DECLARE_bool(graph_get_neighbor_id); -DECLARE_int32(gpugraph_storage_mode); +PHI_DECLARE_bool(graph_get_neighbor_id); +PHI_DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 2afeef1efec40..78b8b2e078773 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -18,9 +18,10 @@ #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -DECLARE_bool(sync_nccl_allreduce); +PHI_DECLARE_bool(sync_nccl_allreduce); #endif namespace paddle { diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 6d6efeedb2efa..1f807f235ec33 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/computation_op_handle.h" - #include +#include "paddle/phi/core/flags.h" -DECLARE_bool(allreduce_record_one_event); +PHI_DECLARE_bool(allreduce_record_one_event); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index de9161b1e3312..29d5697b23f0d 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -19,9 +19,10 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/backends/device_memory_aligment.h" +#include "paddle/phi/core/flags.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); -DECLARE_bool(allreduce_record_one_event); +PHI_DECLARE_bool(allreduce_record_one_event); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index 0f79d781d1217..15648aa058f07 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -14,9 +14,10 @@ #include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -DECLARE_bool(sync_nccl_allreduce); +PHI_DECLARE_bool(sync_nccl_allreduce); #endif namespace paddle { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 7890e37e67246..5d6975df9c4d6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -20,9 +20,10 @@ #include "paddle/phi/common/amp_type_traits.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" -DECLARE_int32(check_nan_inf_level); +PHI_DECLARE_int32(check_nan_inf_level); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 3e001299e8e38..5569a6f29af90 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -24,9 +24,10 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" -DECLARE_int32(check_nan_inf_level); +PHI_DECLARE_int32(check_nan_inf_level); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 59865162cc365..f4ee2c20b9d2f 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -20,8 +20,8 @@ #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" - #ifdef _WIN32 #include #include @@ -31,7 +31,7 @@ #define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) #endif 
-DECLARE_int32(check_nan_inf_level); +PHI_DECLARE_int32(check_nan_inf_level); namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 5ed4d80203fb3..db7fc45b246c6 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -28,8 +28,9 @@ #include "paddle/fluid/platform/dynload/rccl.h" #endif #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(sync_nccl_allreduce); +PHI_DECLARE_bool(sync_nccl_allreduce); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index a81269b1ecea8..e17f3cbcf6c56 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -18,9 +18,9 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/event_tracing.h" - PADDLE_DEFINE_EXPORTED_bool( cpu_deterministic, false, diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 88a074763e482..a84fb8eec9eb6 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/scope_buffered_monitor.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/flags.h" namespace paddle { namespace framework { @@ -22,7 +23,7 @@ class Variable; } // namespace framework } // namespace paddle -DECLARE_double(local_exe_sub_scope_limit); +PHI_DECLARE_double(local_exe_sub_scope_limit); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index d4397225ac81d..d5ab538548361 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -26,8 +26,9 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(sync_nccl_allreduce); +PHI_DECLARE_bool(sync_nccl_allreduce); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/dist_multi_trainer_test.cc b/paddle/fluid/framework/dist_multi_trainer_test.cc index a96a7c3a24f9b..f4c232e2b6b42 100644 --- a/paddle/fluid/framework/dist_multi_trainer_test.cc +++ b/paddle/fluid/framework/dist_multi_trainer_test.cc @@ -22,7 +22,7 @@ #else #define _LINUX #endif -DECLARE_bool(enable_exit_when_partial_worker); +PHI_DECLARE_bool(enable_exit_when_partial_worker); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 573971c6d123b..e0ad2255743c4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -29,9 +29,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif #include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/phi/core/flags.h" DECLARE_bool(benchmark); -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index b4590548d70fb..2c58573f0153c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -31,6 +31,7 @@ #include "hash_functions.cuh" #include "managed.cuh" #include "managed_allocator.cuh" +#include "paddle/phi/core/flags.h" // TODO: replace this with CUDA_TRY and propagate the error #ifndef CUDA_RT_CALL @@ -51,7 +52,7 @@ } #endif -DECLARE_bool(gpugraph_enable_hbm_table_collision_stat); +PHI_DECLARE_bool(gpugraph_enable_hbm_table_collision_stat); // TODO: can we do this more efficiently? __inline__ __device__ int8_t atomicCAS(int8_t* address, diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index e67698110b3a3..32c4c404388b5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -22,7 +22,9 @@ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/core/enforce.h" -DECLARE_bool(gpugraph_load_node_list_into_hbm); +#include "paddle/phi/core/flags.h" + +PHI_DECLARE_bool(gpugraph_load_node_list_into_hbm); namespace paddle { namespace framework { struct GpuPsNodeInfo { diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h index 655e3c2a5cf0d..fbdcb181e0f28 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h @@ -22,8 +22,9 @@ #include #include #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(gpugraph_debug_gpu_memory); +PHI_DECLARE_bool(gpugraph_debug_gpu_memory); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index e32cb1fac4b3a..8bbe85b192a40 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -22,9 +22,10 @@ #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/flags.h" #ifdef PADDLE_WITH_HETERPS -DECLARE_double(gpugraph_hbm_table_load_factor); +PHI_DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 2ca1eba77e57a..597db9520ca43 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -18,8 +18,9 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -DECLARE_int32(gpugraph_storage_mode); 
-DECLARE_bool(graph_metapath_split_opt); +#include "paddle/phi/core/flags.h" +PHI_DECLARE_int32(gpugraph_storage_mode); +PHI_DECLARE_bool(graph_metapath_split_opt); namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 61e7f6b0281b4..4c00c1d150205 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -25,15 +25,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU_KP #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif - -DECLARE_double(gpugraph_hbm_table_load_factor); -DECLARE_bool(gpugraph_enable_gpu_direct_access); -DECLARE_bool(gpugraph_enable_segment_merge_grads); -DECLARE_uint64(gpugraph_merge_grads_segment_size); -DECLARE_int32(gpugraph_dedup_pull_push_mode); -DECLARE_bool(enable_tracker_all2all); -DECLARE_bool(enable_all2all_use_fp16); -DECLARE_bool(enable_sparse_inner_gather); +#include "paddle/phi/core/flags.h" + +PHI_DECLARE_double(gpugraph_hbm_table_load_factor); +PHI_DECLARE_bool(gpugraph_enable_gpu_direct_access); +PHI_DECLARE_bool(gpugraph_enable_segment_merge_grads); +PHI_DECLARE_uint64(gpugraph_merge_grads_segment_size); +PHI_DECLARE_int32(gpugraph_dedup_pull_push_mode); +PHI_DECLARE_bool(enable_tracker_all2all); +PHI_DECLARE_bool(enable_all2all_use_fp16); +PHI_DECLARE_bool(enable_sparse_inner_gather); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index 63e9226fcc095..866a1292bcbd8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -23,10 +23,11 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif +#include "paddle/phi/core/flags.h" #include "paddle/utils/string/string_helper.h" -DECLARE_bool(enable_auto_detect_gpu_topo); -DECLARE_bool(enable_auto_rdma_trans); +PHI_DECLARE_bool(enable_auto_detect_gpu_topo); +PHI_DECLARE_bool(enable_auto_rdma_trans); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 7ab49bc025e27..5b48d54543b74 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -42,9 +42,10 @@ limitations under the License. */ #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif +#include "paddle/phi/core/flags.h" -DECLARE_int32(gpugraph_dedup_pull_push_mode); -DECLARE_int32(gpugraph_storage_mode); +PHI_DECLARE_int32(gpugraph_dedup_pull_push_mode); +PHI_DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0d4a6c4871d6a..98c44685ea273 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -66,7 +66,9 @@ limitations under the License. 
*/ #include "downpour_accessor.h" // NOLINT #endif #include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" -DECLARE_int32(gpugraph_storage_mode); +#include "paddle/phi/core/flags.h" + +PHI_DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 40222dff3088a..3296679e1eeeb 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -19,10 +19,11 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/core/flags.h" -DECLARE_double(eager_delete_tensor_gb); -DECLARE_double(memory_fraction_of_eager_deletion); -DECLARE_bool(fast_eager_deletion_mode); +PHI_DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(memory_fraction_of_eager_deletion); +PHI_DECLARE_bool(fast_eager_deletion_mode); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b5baeacaec835..ce50efc67008a 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -29,8 +29,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#include "paddle/phi/core/flags.h" -DECLARE_bool(enable_exit_when_partial_worker); +PHI_DECLARE_bool(enable_exit_when_partial_worker); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index 0c09ceac9955a..9f9d8b2bac258 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" - +#include "paddle/fluid/platform/flags.h" namespace paddle { namespace framework { class ProgramDesc; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 5143ccfe4531c..caaba441d1ff3 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/operator.h" - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool(convert_all_blocks, true, "Convert all blocks in program into SSAgraphs"); diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 7e6ef668fb398..04ff8c0190837 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -28,7 +28,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/collective_helper.h" #endif - +#include "paddle/fluid/platform/flags.h" DECLARE_bool(convert_all_blocks); PADDLE_DEFINE_EXPORTED_string(print_sub_graph_dir, "", diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index 02934922f821e..a0f1d9eed0038 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); @@ -29,7 +30,7 @@ USE_OP_ITSELF(elementwise_add_grad); PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); -DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(eager_delete_tensor_gb); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index f773a386af473..c1ba3b193f1de 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -24,9 +24,9 @@ #include #include "glog/logging.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/utils.h" -#include "paddle/phi/core/flags.h" DECLARE_bool(use_stream_safe_cuda_allocator); PADDLE_DEFINE_EXPORTED_string(static_executor_perfstat_filepath, diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index 8ff8b9528322f..ed2d48f4b1d85 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -17,9 +17,10 @@ #include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" #include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" #include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(fast_eager_deletion_mode); -DECLARE_bool(new_executor_use_cuda_graph); +PHI_DECLARE_bool(fast_eager_deletion_mode); +PHI_DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 9d9a5e126a9ce..3f0cce3c37830 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( add_dependency_for_communication_op, true, diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 090e973155334..3e630c32278ed 100644 --- 
a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/ops_extra_info.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" @@ -40,8 +41,8 @@ PADDLE_DEFINE_EXPORTED_bool( false, "Log memory stats after each op runs, just used for debug."); -DECLARE_bool(use_mkldnn); -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(check_nan_inf); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index d61a0a472d873..494a5afa7ed56 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/platform/mkldnn_helper.h" #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/phi/backends/device_manager.h" PADDLE_DEFINE_EXPORTED_bool( @@ -51,11 +52,11 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); -DECLARE_bool(new_executor_use_cuda_graph); +PHI_DECLARE_bool(new_executor_use_cuda_graph); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_bool(sync_nccl_allreduce); +PHI_DECLARE_bool(sync_nccl_allreduce); #endif constexpr const char* kExceptionCaught = "ExceptionCaught"; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c0c62a6fd2aa8..13ef07ab9855e 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -25,8 +25,7 @@ limitations under the License. */ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "gflags/gflags.h" -#include "glog/logging.h" // For VLOG() +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -34,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/macros.h" @@ -70,7 +70,7 @@ class Version; } // namespace framework } // namespace paddle -DECLARE_bool(check_kernel_launch); +PHI_DECLARE_bool(check_kernel_launch); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6da5403dc5fc..65d31b87ed355 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -39,6 +39,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/compat/get_kerneltype_forvar_utils.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/ops/compat/signatures.h" @@ -62,10 +63,10 @@ class DenseTensor; #endif DECLARE_bool(benchmark); -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); -DECLARE_bool(run_kp_kernel); -DECLARE_bool(enable_host_event_recorder_hook); +PHI_DECLARE_bool(run_kp_kernel); +PHI_DECLARE_bool(enable_host_event_recorder_hook); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f055213f7bbc3..23aac65dd1fdb 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -42,6 +42,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/macros.h" @@ -59,7 +60,7 @@ namespace phi { class KernelContext; } -DECLARE_int32(inner_op_parallelism); +PHI_DECLARE_int32(inner_op_parallelism); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 4c1538a28fedb..c0013cfb99eec 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -47,10 +47,11 @@ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(enable_pe_launch_cinn); -DECLARE_bool(enable_cinn_auto_tune); -DECLARE_string(cinn_subgraph_graphviz_dir); +PHI_DECLARE_bool(enable_pe_launch_cinn); +PHI_DECLARE_bool(enable_cinn_auto_tune); +PHI_DECLARE_string(cinn_subgraph_graphviz_dir); namespace paddle { namespace framework { namespace paddle2cinn { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 4e9257c0613ef..61f74b22f76b7 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -37,9 +37,10 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(allow_cinn_ops); -DECLARE_string(deny_cinn_ops); +PHI_DECLARE_string(allow_cinn_ops); +PHI_DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 675ec593d9366..3820667116bc6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -44,11 +44,12 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "paddle/fluid/platform/flags.h" -DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(eager_delete_tensor_gb); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_bool(sync_nccl_allreduce); +PHI_DECLARE_bool(sync_nccl_allreduce); #endif #ifdef WITH_GPERFTOOLS diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index a54110add67a8..31346652da2b4 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" - +#include "paddle/fluid/platform/flags.h" DECLARE_bool(benchmark); PADDLE_DEFINE_EXPORTED_bool( diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index 1350dba8296ec..ad21fdf45698b 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -18,12 +18,11 @@ limitations under the License. */ #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( enable_unused_var_check, false, diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index b5433a43ef119..a2cea4067a0ea 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -30,10 +30,11 @@ #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/funcs/math_function.h" -DECLARE_bool(sort_sum_gradient); +PHI_DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc index 06c2719ebfa63..f66aacc89ec50 100644 --- a/paddle/fluid/imperative/flags.cc +++ b/paddle/fluid/imperative/flags.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/imperative/flags.h" -#include "paddle/phi/core/flags.h" +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0, diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 2ac43c39d72ab..6538042a37e96 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -24,12 +24,13 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 471627e4ca524..0aff56ff88a04 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -33,9 +33,10 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include 
"paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" -DECLARE_bool(sort_sum_gradient); +PHI_DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 31a9c28fff68d..cda2fad5d7436 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -33,10 +33,11 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(check_nan_inf); +PHI_DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); -DECLARE_bool(run_kp_kernel); +PHI_DECLARE_bool(run_kp_kernel); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index bd0f7a060d844..c956230b1447c 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -31,10 +31,11 @@ #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/selected_rows.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc index fc3066051d853..57f835fe5f6a8 100644 --- a/paddle/fluid/imperative/profiler.cc +++ b/paddle/fluid/imperative/profiler.cc @@ -20,8 +20,7 @@ #include #include // NOLINT - -#include "paddle/phi/core/flags.h" +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_string( tracer_profile_fname, diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 6207fc54f4d16..2d4e6622c05b7 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -30,10 +30,11 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(use_mkldnn); -DECLARE_string(tracer_mkldnn_ops_on); -DECLARE_string(tracer_mkldnn_ops_off); +PHI_DECLARE_bool(use_mkldnn); +PHI_DECLARE_string(tracer_mkldnn_ops_on); +PHI_DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index b0f53c1f639ac..afb2dcd981fa8 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/phi/backends/cpu/cpu_info.h" +#include "paddle/phi/core/flags.h" #include "paddle/utils/string/split.h" #ifdef PADDLE_WITH_TENSORRT @@ -31,7 +32,7 @@ #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif namespace paddle { diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 44bcc3bf900cb..b6b20a901b2bb 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ 
b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -17,11 +17,9 @@ limitations under the License. */ */ #include // use glog instead of CHECK to avoid importing other paddle header files. - #include "gflags/gflags.h" #include "utils.h" // NOLINT -DECLARE_double(fraction_of_gpu_memory_to_use); DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string(data, diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index fad5c30e9ff22..7850b4edb1098 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -21,9 +21,6 @@ limitations under the License. */ #include "gflags/gflags.h" #include "utils.h" // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_double(fraction_of_gpu_memory_to_use); -#endif DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string(data, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 05746ea5123c8..8e1531352137f 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -39,10 +39,11 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/stream.h" #include "paddle/utils/any.h" -DECLARE_bool(trt_ibuilder_cache); +PHI_DECLARE_bool(trt_ibuilder_cache); namespace paddle { namespace inference { diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 21a187ad67100..57f457dbd8122 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -25,8 +25,9 @@ #include "paddle/fluid/jit/layer.h" #include "paddle/fluid/jit/property.h" #include "paddle/fluid/jit/serializer_utils.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(jit_engine_type); +PHI_DECLARE_string(jit_engine_type); namespace paddle { namespace jit { diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index a7cf653980d96..dea6855cf5abb 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -23,8 +23,9 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(allocator_strategy); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 1d427008c7e4c..527e843d05bb8 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" @@ -62,6 +61,7 @@ #include "paddle/fluid/memory/allocation/custom_allocator.h" #include "paddle/fluid/platform/device/device_wrapper.h" #endif +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, @@ -92,8 +92,8 @@ 
PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, "managed memory, only available for auto_growth " "strategy"); -DECLARE_string(allocator_strategy); -DECLARE_uint64(auto_growth_chunk_size_in_mb); +PHI_DECLARE_string(allocator_strategy); +PHI_DECLARE_uint64(auto_growth_chunk_size_in_mb); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index d3f16ec628660..1e09c43c4f12f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -15,15 +15,16 @@ #include #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_int64(gpu_allocator_retry_time); #endif -DECLARE_string(allocator_strategy); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 85cd851a2140a..63e3eab3256c9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -15,15 +15,16 @@ #include #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_int64(gpu_allocator_retry_time); #endif -DECLARE_string(allocator_strategy); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index 518b31e943048..a133db577ee7c 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -14,10 +14,10 @@ #include "paddle/fluid/memory/allocation/allocator_strategy.h" -#include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(allocator_strategy); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index cf3bb15fdd8a1..ccbaca8ed7188 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -18,8 +18,8 @@ #include // NOLINT #include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include 
"paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/phi/core/flags.h" PADDLE_DEFINE_EXPORTED_READONLY_bool( free_idle_chunk, diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index 70c43145cc85d..bfd05b6b323fe 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -19,17 +19,17 @@ #include #include // NOLINT -#include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_int64(gpu_allocator_retry_time); #endif -DECLARE_string(allocator_strategy); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 17edc412271c5..c90c506cc0c4b 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -16,12 +16,12 @@ limitations under the License. */ #include -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define USE_DEVICE -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 79f2e17102622..1aeb1722d0ec8 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -22,14 +22,14 @@ limitations under the License. 
*/ #include #include -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif namespace paddle { diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 2e091fa1b2f60..1467d4b81081d 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -26,8 +26,9 @@ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(use_shm_cache); +PHI_DECLARE_bool(use_shm_cache); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 10dfe27391626..8c9eb889add6c 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -16,7 +16,6 @@ #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/memory/allocation/buddy_allocator.h" #include "paddle/fluid/memory/allocation/system_allocator.h" @@ -30,7 +29,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -39,9 +38,9 @@ PADDLE_DEFINE_EXPORTED_bool( "To find this error in time, we use init_allocated_mem to indicate " "that initializing the allocated memory with a small value " "during unit testing."); -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(benchmark); namespace paddle { diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index f5069d655f8a7..210be01669775 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -26,11 +26,12 @@ limitations under the License. */ #else #include // for mlock and munlock #endif -#include "gflags/gflags.h" + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/cpu/cpu_info.h" +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" @@ -39,10 +40,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" -DECLARE_bool(use_pinned_memory); -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_bool(use_pinned_memory); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc index dbb0165aa6f9f..e04d14f0adfde 100644 --- a/paddle/fluid/memory/allocation/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -14,14 +14,14 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/system_allocator.h" +#include #include -#include "gflags/gflags.h" -#include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(use_pinned_memory); +PHI_DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { bool freed = false; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index c3deef8da6d51..9627d4a2d6808 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -19,9 +19,10 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" +#include "paddle/phi/core/flags.h" -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_string(allocator_strategy); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 23f1bb7bdd340..0edcb3726d059 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -29,7 +29,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/backward.h" -DECLARE_bool(use_mkldnn); + +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index f93cb32a850ef..012edde57294a 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -27,9 +27,10 @@ namespace cub = hipcub; #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" -DECLARE_bool(cudnn_batchnorm_spatial_persistent); +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index c6a4228300720..ad74d88f70e1d 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -21,9 +21,10 @@ #include "cinn/runtime/cinn_runtime.h" #include "cinn/runtime/flags.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/generator.h" -DECLARE_bool(cudnn_deterministic); +PHI_DECLARE_bool(cudnn_deterministic); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index d5befbce611c2..59970412ea6a3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -21,7 +21,6 @@ #include #include "cinn/common/target.h" -#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -29,9 +28,10 @@ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(enable_pe_launch_cinn); -DECLARE_bool(enable_interpretercore_launch_cinn); +PHI_DECLARE_bool(enable_pe_launch_cinn); +PHI_DECLARE_bool(enable_interpretercore_launch_cinn); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 29ae8bfe3cd9c..9764c31200f9e 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" @@ -29,15 +28,16 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(cinn_launch); USE_OP_ITSELF(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); -DECLARE_double(eager_delete_tensor_gb); -DECLARE_bool(enable_pe_launch_cinn); -DECLARE_bool(enable_interpretercore_launch_cinn); -DECLARE_bool(enable_cinn_auto_tune); +PHI_DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_bool(enable_pe_launch_cinn); +PHI_DECLARE_bool(enable_interpretercore_launch_cinn); +PHI_DECLARE_bool(enable_cinn_auto_tune); PD_DECLARE_KERNEL(cinn_launch, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(cinn_instruction_run, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index 44996e7e4b496..a22df97f1cd57 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -17,8 +17,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/core/flags.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index ee8ec2e276b61..156ba70467380 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 4c7578c010473..cb472fc6948ca 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( cache_inference_while_scope, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 678cb180e79dc..0d88d44defffc 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -23,10 +23,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" #include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" -DECLARE_bool(cudnn_batchnorm_spatial_persistent); +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index ac2a84e473b0f..a93938fcfd043 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -23,10 +23,11 @@ #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/norm_utils.h" -DECLARE_bool(cudnn_batchnorm_spatial_persistent); +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 4aae200e550da..1fa7ff1826b07 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -23,10 +23,11 @@ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/norm_utils.h" -DECLARE_bool(cudnn_batchnorm_spatial_persistent); +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 8025ba97ac004..17b6f8c445b1e 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -17,12 +17,13 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/kernels/funcs/dropout_impl_util.h" #include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/layer_norm_kernel.h" -DECLARE_bool(use_fast_math); +PHI_DECLARE_bool(use_fast_math); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 4769433317f0f..459a1f0f7146f 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -43,7 +44,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif -DECLARE_bool(gemm_use_half_precision_compute_type); +PHI_DECLARE_bool(gemm_use_half_precision_compute_type); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 279277e18f702..5a39de9fa4d65 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h" #include "paddle/fluid/framework/op_registry.h" - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index f87e66d69da4d..e29844d8f1b70 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -15,18 +15,19 @@ limitations under the License. */ #include #include +#include #include #include #include #include // NOLINT -#include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/flags.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -34,7 +35,7 @@ namespace distributed = paddle::distributed; using MultiVarMsg = ::paddle::distributed::MultiVariableMessage; using VarMsg = ::paddle::distributed::VariableMessage; -DECLARE_double(eager_delete_tensor_gb); +PHI_DECLARE_double(eager_delete_tensor_gb); USE_OP_ITSELF(scale); USE_NO_KERNEL_OP(heter_listen_and_serv); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 513ae0d2785f9..ff6f6ceac54c8 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -37,8 +37,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/operators/cuda_graph_with_in_out.h" #endif +#include "paddle/phi/core/flags.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc index e9e45c0292baf..6f7ccd03cff2f 100644 --- a/paddle/fluid/platform/cpu_info_test.cc +++ b/paddle/fluid/platform/cpu_info_test.cc @@ -18,8 +18,9 @@ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/core/flags.h" -DECLARE_double(fraction_of_cpu_memory_to_use); +PHI_DECLARE_double(fraction_of_cpu_memory_to_use); TEST(CpuMemoryUsage, Print) { std::stringstream ss; diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 095a94cd32e2f..607dcf7622e47 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -17,9 +17,10 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/core/flags.h" DECLARE_bool(use_stream_safe_cuda_allocator); -DECLARE_bool(new_executor_use_cuda_graph); +PHI_DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 3373ef51836ea..7f1f2c76bd630 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" @@ -31,7 +32,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/core/flags.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" @@ -46,11 +46,11 @@ limitations under the License. */ #endif #endif -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_uint64(gpu_memory_limit_mb); +PHI_DECLARE_double(fraction_of_gpu_memory_to_use); +PHI_DECLARE_uint64(initial_gpu_memory_in_mb); +PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); +PHI_DECLARE_bool(enable_cublas_tensor_op_math); +PHI_DECLARE_uint64(gpu_memory_limit_mb); PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index f91b420be0d8a..2a410182b48a4 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" -DECLARE_string(cuda_dir); +PHI_DECLARE_string(cuda_dir); namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 312febb21fce2..425d4939b565f 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -107,7 +107,7 @@ namespace phi { class ErrorSummary; } // namespace phi -DECLARE_int32(call_stack_level); +PHI_DECLARE_int32(call_stack_level); namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h new file mode 100644 index 0000000000000..32ce438deef44 --- /dev/null +++ b/paddle/fluid/platform/flags.h @@ -0,0 +1,77 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/phi/core/flags.h" + +#define __PADDLE_DEFINE_EXPORTED_FLAG( \ + __name, __is_writable, __cpp_type, __gflag_type, __default_value, __doc) \ + DEFINE_##__gflag_type(__name, __default_value, __doc); \ + struct __PaddleRegisterFlag_##__name { \ + __PaddleRegisterFlag_##__name() { \ + using FlagDeclaredType = \ + typename std::remove_reference::type; \ + static_assert(std::is_same::value || \ + std::is_arithmetic::value, \ + "FLAGS should be std::string or arithmetic type"); \ + auto *instance = ::phi::GetMutableExportedFlagInfoMap(); \ + auto &info = (*instance)[#__name]; \ + info.name = #__name; \ + info.value_ptr = &(FLAGS_##__name); \ + info.default_value = static_cast<__cpp_type>(__default_value); \ + info.doc = __doc; \ + info.is_writable = __is_writable; \ + } \ + int Touch() const { return 0; } \ + }; \ + static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name; \ + int TouchPaddleFlagRegister_##__name() { \ + return __PaddleRegisterFlag_instance##__name.Touch(); \ + } \ + static_assert(std::is_same<__PaddleRegisterFlag_##__name, \ + ::__PaddleRegisterFlag_##__name>::value, \ + "FLAGS should define in global namespace") + +#define PADDLE_FORCE_LINK_FLAG(__name) \ + extern int TouchPaddleFlagRegister_##__name(); \ + UNUSED static int __paddle_use_flag_##__name = \ + TouchPaddleFlagRegister_##__name() + +#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc) +#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG( \ + name, true, uint64_t, uint64, default_value, doc) + +#define 
PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc) + +#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc) \ + __PADDLE_DEFINE_EXPORTED_FLAG( \ + name, true, ::std::string, string, default_value, doc) diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index 308a05be51126..ca9f9d7c4f806 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -29,12 +29,12 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/split.h" - +#include "paddle/phi/core/flags.h" #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif -DECLARE_int32(get_host_by_name_time); +PHI_DECLARE_int32(get_host_by_name_time); namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4cb92cb7804d7..3fa02f82a6b95 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -53,10 +53,11 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/custom_kernel.h" -DECLARE_int32(paddle_num_threads); +PHI_DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, 1, diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 354259ca91b74..fdaa67e3dd17d 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/place.h" - +#include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( benchmark, false, diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 6e06e8aa41ca4..cd8e8ea350f0e 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -32,8 +32,8 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/os_info.h" - PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index ba681c0360618..68ad4b4b34571 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -17,9 +17,9 @@ #include "glog/logging.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" -#include "paddle/phi/core/flags.h" // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. 
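The new paddle/fluid/platform/flags.h above keeps the PADDLE_DEFINE_EXPORTED_* entry points for fluid code but routes registration through phi's ExportedFlagInfoMap, and PADDLE_FORCE_LINK_FLAG pulls a flag's defining object file in when linking from a static library. A minimal sketch of how the pieces are meant to fit together follows; the flag name enable_demo_feature, the file names, and the helper function are hypothetical and only illustrate the pattern applied throughout this patch.

// demo_flag.cc : defines and registers a writable, exported bool flag
// (hypothetical flag, shown only to illustrate the macro usage).
#include "paddle/fluid/platform/flags.h"

PADDLE_DEFINE_EXPORTED_bool(enable_demo_feature,
                            false,
                            "Illustrative flag; guards a demo code path.");

// demo_user.cc : another translation unit declares the flag through phi
// (PHI_DECLARE_bool replaces the old gflags DECLARE_bool) and, if it is
// linked from a static library, forces the defining TU to be kept.
#include "paddle/fluid/platform/flags.h"

PHI_DECLARE_bool(enable_demo_feature);
PADDLE_FORCE_LINK_FLAG(enable_demo_feature);

bool DemoPathEnabled() { return FLAGS_enable_demo_feature; }

The PHI_DECLARE_* / PHI_DEFINE_EXPORTED_* replacements in the remaining hunks follow the same pattern; the phi variants additionally mark the flag symbols with PHI_EXPORT_FLAG / PHI_IMPORT_FLAG so they can cross the phi DLL boundary on Windows when BUILD_PHI_SHARED is set.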
diff --git a/paddle/fluid/platform/test_limit_gpu_memory.cu b/paddle/fluid/platform/test_limit_gpu_memory.cu index 6bc84a8be9e81..7fdf49ab11bad 100644 --- a/paddle/fluid/platform/test_limit_gpu_memory.cu +++ b/paddle/fluid/platform/test_limit_gpu_memory.cu @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" - -DECLARE_uint64(gpu_memory_limit_mb); +#include "paddle/phi/core/flags.h" +PHI_DECLARE_uint64(gpu_memory_limit_mb); namespace paddle { namespace platform { diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index b1b24af231f68..eb13902be2068 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -32,7 +32,7 @@ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); namespace paddle { namespace prim { diff --git a/paddle/fluid/prim/utils/utils.cc b/paddle/fluid/prim/utils/utils.cc index aa5255f532adc..0cc160887e78d 100644 --- a/paddle/fluid/prim/utils/utils.cc +++ b/paddle/fluid/prim/utils/utils.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/prim/utils/utils.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/prim/utils/static/static_global_utils.h" -#include "paddle/phi/core/flags.h" PADDLE_DEFINE_EXPORTED_bool(prim_enabled, false, "enable_prim or not"); namespace paddle { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 3996ac81ef84c..f438acc5fff93 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -62,11 +62,11 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/pybind/cuda_streams_py.h" #endif -#include "gflags/gflags.h" #include "paddle/phi/api/include/operants_manager.h" #include "paddle/phi/api/include/tensor_operants.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 11dca753092fd..1da7bd774531b 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -59,10 +59,11 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -DECLARE_bool(set_to_1d); +PHI_DECLARE_bool(set_to_1d); namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 0c8898b524fae..553701906f59e 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -169,11 +169,12 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/parallel_executor.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "pybind11/stl.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 1c27d70d1bea7..59ed791c4522c 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -169,11 +169,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/place.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "pybind11/stl.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 359c2266f8ea3..f4dfb133c1c36 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -193,11 +193,12 @@ limitations under the License. */ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/include/operants_manager.h" #include "paddle/phi/api/include/tensor_operants.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "pybind11/stl.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 8b8c29b8b1234..0640a070cbcec 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -32,9 +32,10 @@ #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/flags.h" #include "pybind11/stl.h" -DECLARE_bool(reader_queue_speed_test_mode); +PHI_DECLARE_bool(reader_queue_speed_test_mode); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index cf10befbafaf4..799e48b1594de 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -170,12 +170,13 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/tensor.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "pybind11/stl.h" -DECLARE_bool(use_mkldnn); -DECLARE_bool(use_shm_cache); +PHI_DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_shm_cache); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); diff --git a/paddle/phi/api/profiler/profiler.cc b/paddle/phi/api/profiler/profiler.cc index 906ca12c7c581..f355872e881ee 100644 --- a/paddle/phi/api/profiler/profiler.cc +++ b/paddle/phi/api/profiler/profiler.cc @@ -33,13 +33,13 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/nvtx.h" #endif -DEFINE_bool(enable_host_event_recorder_hook, - false, - "enable HostEventRecorder, hook Profiler"); +PHI_DEFINE_bool(enable_host_event_recorder_hook, + false, + "enable HostEventRecorder, hook Profiler"); -DEFINE_bool(enable_record_op_info, - false, - "enable operator supplement info recorder"); +PHI_DEFINE_bool(enable_record_op_info, + false, + "enable operator supplement info recorder"); namespace phi { diff --git a/paddle/phi/api/profiler/profiler.h b/paddle/phi/api/profiler/profiler.h index 6863df8c635e7..fe5e3ec070a2c 100644 --- a/paddle/phi/api/profiler/profiler.h +++ b/paddle/phi/api/profiler/profiler.h @@ -23,12 +23,12 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(enable_host_event_recorder_hook); +PHI_DECLARE_bool(enable_host_event_recorder_hook); namespace phi { diff --git a/paddle/phi/backends/cpu/cpu_info.cc b/paddle/phi/backends/cpu/cpu_info.cc index 9d6b4e46df07d..0e91898fc3b93 100644 --- a/paddle/phi/backends/cpu/cpu_info.cc +++ b/paddle/phi/backends/cpu/cpu_info.cc @@ -43,9 +43,9 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use); // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, - true, - "If set, allocate cpu pinned memory."); +PHI_DEFINE_EXPORTED_bool(use_pinned_memory, + true, + "If set, allocate cpu pinned memory."); namespace phi { namespace backends { diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index e3ec61576b2ac..85248360361ea 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -26,69 +26,73 @@ limitations under the License. */ // TODO(wilber): The phi computing library requires a component to manage flags // (maybe not use gflags). -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/phi/core/flags.h" -DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); +PHI_DEFINE_string(cudnn_dir, + "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); -DEFINE_string( +PHI_DEFINE_string( cuda_dir, "", "Specify path for loading cuda library, such as libcublas, libcublasLt " "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " "If default, dlopen will search cuda from LD_LIBRARY_PATH"); -DEFINE_string(nccl_dir, - "", - "Specify path for loading nccl library, such as libnccl.so. " - "For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +PHI_DEFINE_string(nccl_dir, + "", + "Specify path for loading nccl library, such as libnccl.so. " + "For instance, /usr/local/cuda/lib64. 
If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +PHI_DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); -DEFINE_string( +PHI_DEFINE_string( tensorrt_dir, "", "Specify path for loading tensorrt library, such as libnvinfer.so."); -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +PHI_DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); -DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); +PHI_DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); -DEFINE_string(mkl_dir, - "", - "Specify path for loading libmkl_rt.so. " - "For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/." - "If default, " - "dlopen will search mkl from LD_LIBRARY_PATH"); +PHI_DEFINE_string(mkl_dir, + "", + "Specify path for loading libmkl_rt.so. " + "For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/." + "If default, " + "dlopen will search mkl from LD_LIBRARY_PATH"); -DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); +PHI_DEFINE_string(op_dir, + "", + "Specify path for loading user-defined op library."); -DEFINE_string(cusparselt_dir, "", "Specify path for loading libcusparseLt.so."); +PHI_DEFINE_string(cusparselt_dir, + "", + "Specify path for loading libcusparseLt.so."); #ifdef PADDLE_WITH_HIP -DEFINE_string(miopen_dir, - "", - "Specify path for loading libMIOpen.so. For instance, " - "/opt/rocm/miopen/lib. If empty [default], dlopen " - "will search miopen from LD_LIBRARY_PATH"); - -DEFINE_string(rocm_dir, - "", - "Specify path for loading rocm library, such as librocblas, " - "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " - "If default, dlopen will search rocm from LD_LIBRARY_PATH"); - -DEFINE_string(rccl_dir, - "", - "Specify path for loading rccl library, such as librccl.so. " - "For instance, /opt/rocm/rccl/lib. If default, " - "dlopen will search rccl from LD_LIBRARY_PATH"); +PHI_DEFINE_string(miopen_dir, + "", + "Specify path for loading libMIOpen.so. For instance, " + "/opt/rocm/miopen/lib. If empty [default], dlopen " + "will search miopen from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(rocm_dir, + "", + "Specify path for loading rocm library, such as librocblas, " + "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(rccl_dir, + "", + "Specify path for loading rccl library, such as librccl.so. " + "For instance, /opt/rocm/rccl/lib. If default, " + "dlopen will search rccl from LD_LIBRARY_PATH"); #endif namespace phi { diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index a8fbf7e7e6b7b..121c05a606917 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -25,7 +25,7 @@ limitations under the License. */ // flags. #include "paddle/phi/core/flags.h" -PADDLE_DEFINE_EXPORTED_string( +PHI_DEFINE_EXPORTED_string( selected_xpus, "", "A list of device ids separated by comma, like: 0,1,2,3. 
" diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 0b248a968785a..058ab7159e5ee 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -31,9 +31,9 @@ ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() { } // namespace phi -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, - 0, - "number of threads for inner op"); +PHI_DEFINE_EXPORTED_int32(inner_op_parallelism, + 0, + "number of threads for inner op"); /** * NOTE(paddle-dev): This file is designed to define all public FLAGS. @@ -48,9 +48,9 @@ PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, * instance to 2 * Note: */ -PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, - 1, - "Number of threads for each paddle instance."); +PHI_DEFINE_EXPORTED_int32(paddle_num_threads, + 1, + "Number of threads for each paddle instance."); /** * Low Precision Op related FLAG @@ -63,11 +63,11 @@ PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, * - 1, return the low precision op list of current module. * - 2, return the op list of current module. */ -PADDLE_DEFINE_EXPORTED_int32(low_precision_op_list, - 0, - "Setting the level of low precision op" - "list printing. It will be return the " - "low precision op list of current module."); +PHI_DEFINE_EXPORTED_int32(low_precision_op_list, + 0, + "Setting the level of low precision op" + "list printing. It will be return the " + "low precision op list of current module."); /** * Operator related FLAG @@ -77,7 +77,7 @@ PADDLE_DEFINE_EXPORTED_int32(low_precision_op_list, * Example: * Note: Used to debug. Checking whether operator produce NAN/INF or not. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " @@ -99,7 +99,7 @@ PADDLE_DEFINE_EXPORTED_bool( * overflowing float16's limit. * - 3, print the information of all tensors. */ -PADDLE_DEFINE_EXPORTED_int32( +PHI_DEFINE_EXPORTED_int32( check_nan_inf_level, 0, "Setting the check and print level when FLAGS_check_nan_inf is set."); @@ -112,7 +112,7 @@ PADDLE_DEFINE_EXPORTED_int32( * Example: * Note: Used to debug. Checking whether operator produce NAN/INF or not. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( enable_opt_get_features, false, "Checking whether operator produce NAN/INF or not. It will be " @@ -130,7 +130,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Example: * Note: whether to use Tensor Core, faster but it may loss precision. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " @@ -150,7 +150,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: whether to use fp16 compute type when the input and output is fp16, * faster but it may loss precision. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( gemm_use_half_precision_compute_type, false, "Whether to use fp16 compute type when the input and output is fp16, " @@ -166,7 +166,7 @@ PADDLE_DEFINE_EXPORTED_bool( * cards * Note: A list of device ids separated by comma, like: 0,1,2,3 */ -PADDLE_DEFINE_EXPORTED_string( +PHI_DEFINE_EXPORTED_string( selected_gpus, "", "A list of device ids separated by comma, like: 0,1,2,3. " @@ -195,7 +195,7 @@ PADDLE_DEFINE_EXPORTED_string( * layer specification Once you change the layer specifications * (such as M, N and K), it will re-search again. 
*/ -PADDLE_DEFINE_EXPORTED_int64( +PHI_DEFINE_EXPORTED_int64( cublaslt_exhaustive_search_times, 0, "The times of exhaustive search for cuBlasLt matmul with/without " @@ -210,7 +210,7 @@ PADDLE_DEFINE_EXPORTED_int64( * Example: FLAGS_enable_api_kernel_fallback=true would allow kernel of current * backend fallback to CPU one when not found */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( enable_api_kernel_fallback, true, "Whether enable api kernel fallback to CPU one when not found"); @@ -225,7 +225,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: whether to use deterministic algorithm in cudnn. * If true, it will slow down some operators such as conv and pooling. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " @@ -241,7 +241,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: whether to use deterministic algorithm in embedding op. * If true, it will use deterministic CUDA kernel in embedding op. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( embedding_deterministic, false, "Whether allow using an deterministic algorithm for embedding " @@ -260,10 +260,9 @@ PADDLE_DEFINE_EXPORTED_bool( * increased. * Users need to balance memory and speed. */ -PADDLE_DEFINE_EXPORTED_int64( - conv_workspace_size_limit, - phi::backends::gpu::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PHI_DEFINE_EXPORTED_int64(conv_workspace_size_limit, + phi::backends::gpu::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG @@ -279,7 +278,7 @@ PADDLE_DEFINE_EXPORTED_int64( * layer specification. Once you change the layer specifications * (such as batch size, feature map size), it will search again. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( cudnn_exhaustive_search, false, "Whether enable exhaustive search for cuDNN convolution or " @@ -293,10 +292,10 @@ PADDLE_DEFINE_EXPORTED_bool( * Example: * Note: only used to predict for advanced developer */ -PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, - -1, - "Exhaustive search times for cuDNN convolution, " - "default is -1, not exhaustive search"); +PHI_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, + -1, + "Exhaustive search times for cuDNN convolution, " + "default is -1, not exhaustive search"); /** * CUDNN related FLAG @@ -314,7 +313,7 @@ PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, * certain * input data range. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( cudnn_batchnorm_spatial_persistent, false, "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " @@ -333,7 +332,7 @@ PADDLE_DEFINE_EXPORTED_bool( * https://github.com/PaddlePaddle/Paddle/issues/15049 * If you want to change this default value, why?(gongwb) */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( sync_nccl_allreduce, true, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" @@ -352,10 +351,10 @@ PADDLE_DEFINE_EXPORTED_bool( * into the queue, and then the communicator takes the gradients out * of the queue and sends them after merging. 
*/ -PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, - 20, - "max var num to merge and send"); -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, + 20, + "max var num to merge and send"); +PHI_DEFINE_EXPORTED_bool( communicator_is_sgd_optimizer, true, "gradient sent to the server is the sum of the gradients " @@ -373,9 +372,9 @@ PADDLE_DEFINE_EXPORTED_bool( * space. It is used to avoid training much faster than communication, * so that too many gradients are not sent out in time. */ -PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, - 20, - "queue size to recv gradient before send"); +PHI_DEFINE_EXPORTED_int32(communicator_send_queue_size, + 20, + "queue size to recv gradient before send"); #endif /** @@ -387,10 +386,9 @@ PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, * Note: Control the number of threads used for distributed modules. * If it is not set, it is set to a hard thread. */ -PADDLE_DEFINE_EXPORTED_int32( - dist_threadpool_size, - 0, - "number of threads used for distributed executed."); +PHI_DEFINE_EXPORTED_int32(dist_threadpool_size, + 0, + "number of threads used for distributed executed."); /** * Garbage collector related FLAG @@ -411,7 +409,7 @@ PADDLE_DEFINE_EXPORTED_int32( // Disable gc by default when inference library is built static const double kDefaultEagerDeleteTensorGB = 0; -PADDLE_DEFINE_EXPORTED_double( +PHI_DEFINE_EXPORTED_double( eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB, "Memory size threshold (GB) when the garbage collector clear tensors." @@ -429,7 +427,7 @@ PADDLE_DEFINE_EXPORTED_double( * has finished, which will make the garbage collection strategy faster. * Only works when garbage collection strategy is enabled. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( fast_eager_deletion_mode, true, "Fast eager deletion mode. If enabled, memory would release " @@ -453,7 +451,7 @@ PADDLE_DEFINE_EXPORTED_bool( * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released. * The flag is only valid when running parallel data compilers. */ -PADDLE_DEFINE_EXPORTED_double( +PHI_DEFINE_EXPORTED_double( memory_fraction_of_eager_deletion, 1.0, "Fraction of eager deletion. If less than 1.0, all variables in " @@ -471,7 +469,7 @@ PADDLE_DEFINE_EXPORTED_double( * Note: For selecting allocator policy of PaddlePaddle. */ static constexpr char kDefaultAllocatorStrategy[] = "auto_growth"; -PADDLE_DEFINE_EXPORTED_string( +PHI_DEFINE_EXPORTED_string( allocator_strategy, kDefaultAllocatorStrategy, "The allocation strategy, enum in [naive_best_fit, auto_growth]. " @@ -499,10 +497,10 @@ PADDLE_DEFINE_EXPORTED_string( * size as the memory block will be allocated from the CUDA pinned * request util the CPU does not have enough memory. */ -PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, - 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +PHI_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, + 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); /** * Memory related FLAG @@ -516,10 +514,9 @@ PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory) * as memory block sizes. 
*/ -PADDLE_DEFINE_EXPORTED_uint64( - initial_cpu_memory_in_mb, - 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); +PHI_DEFINE_EXPORTED_uint64(initial_cpu_memory_in_mb, + 500ul, + "Initial CPU memory for PaddlePaddle, in MD unit."); /** * Memory related FLAG @@ -534,7 +531,7 @@ PADDLE_DEFINE_EXPORTED_uint64( * size as the memory block will be allocated from the CPU * request util the CPU does not have enough memory. */ -PADDLE_DEFINE_EXPORTED_double( +PHI_DEFINE_EXPORTED_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," @@ -570,7 +567,7 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; // which may lead to insufficient memory left for paddle constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif -PADDLE_DEFINE_EXPORTED_double( +PHI_DEFINE_EXPORTED_double( fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "Allocate a trunk of gpu memory that is this fraction of the " @@ -591,7 +588,7 @@ PADDLE_DEFINE_EXPORTED_double( * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until * the GPU has no remaining memory. */ -PADDLE_DEFINE_EXPORTED_uint64( +PHI_DEFINE_EXPORTED_uint64( initial_gpu_memory_in_mb, 0ul, "Allocate a trunk of gpu memory whose byte size is specified by " @@ -614,14 +611,14 @@ PADDLE_DEFINE_EXPORTED_uint64( * Note: If the allocated GPU memory blocks are exhausted, * additional GPU memory blocks are reallocated */ -PADDLE_DEFINE_EXPORTED_uint64( +PHI_DEFINE_EXPORTED_uint64( reallocate_gpu_memory_in_mb, 0ul, "If this flag is set, Paddle will reallocate the gpu memory with " "size specified by this flag. Else Paddle will reallocate by " "FLAGS_fraction_of_gpu_memory_to_use"); -PADDLE_DEFINE_EXPORTED_uint64( +PHI_DEFINE_EXPORTED_uint64( gpu_memory_limit_mb, 0UL, "The maximum gpu memory limit that the process can allocate. " @@ -641,7 +638,7 @@ PADDLE_DEFINE_EXPORTED_uint64( * The real chunk size is max(request_size, * FLAGS_auto_growth_chunk_size_in_mb). */ -PADDLE_DEFINE_EXPORTED_uint64( +PHI_DEFINE_EXPORTED_uint64( auto_growth_chunk_size_in_mb, 0ul, "The minimal chunk size of GPU memory block in auto_growth allocator. " @@ -658,7 +655,7 @@ PADDLE_DEFINE_EXPORTED_uint64( * Example: * Note: */ -PADDLE_DEFINE_EXPORTED_double( +PHI_DEFINE_EXPORTED_double( local_exe_sub_scope_limit, 256.0, // MBytes "The memory up limit of sub-scopes of local execution scope for " @@ -666,7 +663,7 @@ PADDLE_DEFINE_EXPORTED_double( "you should set FLAGS_local_exe_sub_scope_limit=-1. " "The default value is 256 MBytes."); -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( reader_queue_speed_test_mode, false, "If set true, the queue.pop will only get data from queue but not " @@ -680,7 +677,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Example: * Note: */ -PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); +PHI_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); /** * Debug related FLAG @@ -702,7 +699,7 @@ static const int32_t kDefaultCallStackLevel = 2; static const int32_t kDefaultCallStackLevel = 1; #endif -PADDLE_DEFINE_EXPORTED_int32( +PHI_DEFINE_EXPORTED_int32( call_stack_level, kDefaultCallStackLevel, "Determine the call stack to print when error or exeception happens." @@ -723,10 +720,10 @@ PADDLE_DEFINE_EXPORTED_int32( * Note: If True, gradients are summed by the reverse order of * the forward execution sequence. 
*/ -PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, - false, - "Sum gradients by the reverse order of " - "the forward execution sequence."); +PHI_DEFINE_EXPORTED_bool(sort_sum_gradient, + false, + "Sum gradients by the reverse order of " + "the forward execution sequence."); /** * Performance related FLAG @@ -736,7 +733,7 @@ PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, * Example: * Note: The maximum number of inplace grad_add. */ -PADDLE_DEFINE_EXPORTED_int32( +PHI_DEFINE_EXPORTED_int32( max_inplace_grad_add, 0, "The maximum number of inplace grad_add. When doing " @@ -752,7 +749,7 @@ PADDLE_DEFINE_EXPORTED_int32( * Now, just set true by default in 2.5 transition time * which will be removed in future (2.6 or 2.7) . */ -PADDLE_DEFINE_EXPORTED_bool(set_to_1d, true, "set 0D Tensor to 1D numpy"); +PHI_DEFINE_EXPORTED_bool(set_to_1d, true, "set 0D Tensor to 1D numpy"); /** * Debug related FLAG @@ -762,9 +759,9 @@ PADDLE_DEFINE_EXPORTED_bool(set_to_1d, true, "set 0D Tensor to 1D numpy"); * Example: * Note: Holds list of operation types with OneDNN kernels to be enabled. */ -PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, - "", - "List of OneDNN operation types to be turned on"); +PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, + "", + "List of OneDNN operation types to be turned on"); /** * Debug related FLAG @@ -774,10 +771,9 @@ PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, * Example: * Note: Holds list of operation types with OneDNN kernels to be disabled. */ -PADDLE_DEFINE_EXPORTED_string( - tracer_mkldnn_ops_off, - "", - "List of OneDNN operation types to be turned off"); +PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, + "", + "List of OneDNN operation types to be turned off"); /** * Debug related FLAG @@ -788,7 +784,7 @@ PADDLE_DEFINE_EXPORTED_string( * Note: Check kernel launch status after every kernel compute. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, "Check kernel launch status after every kernel compute"); @@ -803,13 +799,13 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: Disable cudnn in conv2d. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, - false, - "Disable cudnn in conv2d"); +PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, + false, + "Disable cudnn in conv2d"); -PADDLE_DEFINE_EXPORTED_bool(use_fast_math, - false, - "Whether to use fast math GPU functions."); +PHI_DEFINE_EXPORTED_bool(use_fast_math, + false, + "Whether to use fast math GPU functions."); #endif /** @@ -822,9 +818,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_fast_math, */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ defined(PADDLE_WITH_HIP) -PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, - 120, - "The maximum time for get host by name time"); +PHI_DEFINE_EXPORTED_int32(get_host_by_name_time, + 120, + "The maximum time for get host by name time"); #endif /** @@ -836,7 +832,7 @@ PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, * program when using Fleet APIs. * Note: Apply IR pass to program. Be only useful when using Fleet APIs. 
*/ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); @@ -850,10 +846,10 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: Control whether load graph node and edge with multi threads parallely * If it is not set, load graph data with one thread */ -PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel, - false, - "It controls whether load graph node and edge with " - "mutli threads parallely."); +PHI_DEFINE_EXPORTED_bool(graph_load_in_parallel, + false, + "It controls whether load graph node and edge with " + "mutli threads parallely."); /** * Distributed related FLAG @@ -864,10 +860,10 @@ PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel, * Note: Control whether load graph node and edge with multi threads parallely * If it is not set, load graph data with one thread */ -PADDLE_DEFINE_EXPORTED_bool(graph_metapath_split_opt, - false, - "It controls whether load graph node and edge with " - "mutli threads parallely."); +PHI_DEFINE_EXPORTED_bool(graph_metapath_split_opt, + false, + "It controls whether load graph node and edge with " + "mutli threads parallely."); /** * Distributed related FLAG @@ -878,7 +874,7 @@ PADDLE_DEFINE_EXPORTED_bool(graph_metapath_split_opt, * Note: Control get all neighbor id when running sub part graph * If it is not set, do not need get neighbor id when run all part graph */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( graph_get_neighbor_id, false, "It controls get all neighbor id when running sub part graph."); @@ -892,7 +888,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: Control whether exit trainer when an worker has no ins. * If it is not set, trainer will exit until all worker finish train. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( enable_exit_when_partial_worker, false, "It controls whether exit trainer when an worker has no ins."); @@ -905,9 +901,9 @@ PADDLE_DEFINE_EXPORTED_bool( * Example: * Note: represent gpugraph storage mode, 1 for full hbm, 2 for hbm + mem + ssd. */ -PADDLE_DEFINE_EXPORTED_int32(gpugraph_storage_mode, - 1, - "gpugraph storage mode, default 1"); +PHI_DEFINE_EXPORTED_int32(gpugraph_storage_mode, + 1, + "gpugraph storage mode, default 1"); /** * KP kernel related FLAG @@ -918,9 +914,9 @@ PADDLE_DEFINE_EXPORTED_int32(gpugraph_storage_mode, * Op. * Note: */ -PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, - false, - "It controls whether to run PaddlePaddle using KP"); +PHI_DEFINE_EXPORTED_bool(run_kp_kernel, + false, + "It controls whether to run PaddlePaddle using KP"); /** * Distributed related FLAG @@ -933,12 +929,12 @@ PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, * multiple events. Currently, only fuse allreduce supports this. * Otherwise, the precision may be wrong. */ -PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, - false, - "It controls whether the allreduce operations " - "would only wait one event instead of multiple " - "events. Currently, only fuse allreduce supports " - "this. Otherwise, the precision may be wrong."); +PHI_DEFINE_EXPORTED_bool(allreduce_record_one_event, + false, + "It controls whether the allreduce operations " + "would only wait one event instead of multiple " + "events. Currently, only fuse allreduce supports " + "this. 
Otherwise, the precision may be wrong."); #ifdef PADDLE_WITH_CINN /* @@ -948,8 +944,9 @@ PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, * Value Range: bool, default=false * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN */ -PADDLE_DEFINE_EXPORTED_bool( - use_cinn, false, "It controls whether to run PaddlePaddle using CINN"); +PHI_DEFINE_EXPORTED_bool(use_cinn, + false, + "It controls whether to run PaddlePaddle using CINN"); /* * CINN related FLAG @@ -959,10 +956,10 @@ PADDLE_DEFINE_EXPORTED_bool( * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu` * when using CINN */ -PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, - "", - "It controls the cinn op subset to be used, " - "which has the highest priority."); +PHI_DEFINE_EXPORTED_string(allow_cinn_ops, + "", + "It controls the cinn op subset to be used, " + "which has the highest priority."); /* * CINN related FLAG @@ -972,9 +969,9 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops * when using CINN */ -PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, - "", - "It controls the cinn op subset to be not used."); +PHI_DEFINE_EXPORTED_string(deny_cinn_ops, + "", + "It controls the cinn op subset to be not used."); /* * CINN related FLAG @@ -985,10 +982,10 @@ PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, * instructions of a paddle graph with ParallelExecutor, otherwise with the * CINN compiled runtime program in sequential order. */ -PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, - true, - "It controls whether to execute cinn compiled " - "program with ParallelExecutor"); +PHI_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, + true, + "It controls whether to execute cinn compiled " + "program with ParallelExecutor"); /* * CINN related FLAG @@ -999,10 +996,10 @@ PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, * compiled instructions of a paddle graph with InterpreterCore, otherwise with * the CINN compiled runtime program in sequential order. */ -PADDLE_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn, - true, - "It controls whether to execute cinn compiled " - "program with InterpreterCore"); +PHI_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn, + true, + "It controls whether to execute cinn compiled " + "program with InterpreterCore"); /* * CINN related FLAG @@ -1012,10 +1009,10 @@ PADDLE_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn, * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its * auto-tune feature enabled */ -PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, - false, - "It controls whether to use cinn with " - "its auto-tune feature enabled"); +PHI_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, + false, + "It controls whether to use cinn with " + "its auto-tune feature enabled"); /* * CINN related FLAG @@ -1026,10 +1023,10 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, * CINN sub-graph into "./cinn_graph/", and each sub-graph will save into * "fusion_groups_*"" directory */ -PADDLE_DEFINE_EXPORTED_string(cinn_subgraph_graphviz_dir, - "", - "Specify the directory path of dot file of " - "graph, which is used for debug."); +PHI_DEFINE_EXPORTED_string(cinn_subgraph_graphviz_dir, + "", + "Specify the directory path of dot file of " + "graph, which is used for debug."); #endif @@ -1041,9 +1038,9 @@ PADDLE_DEFINE_EXPORTED_string(cinn_subgraph_graphviz_dir, * Example: FLAGS_new_executor_use_cuda_graph=true would allow * new executor to use CUDA Graph. 
*/ -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, - false, - "Use CUDA Graph in new executor"); +PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, + false, + "Use CUDA Graph in new executor"); DEFINE_int32(record_pool_max_size, 2000000, @@ -1058,56 +1055,56 @@ DEFINE_bool(enable_slotrecord_reset_shrink, DEFINE_bool(enable_ins_parser_file, false, "enable parser ins file, default false"); -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( gpugraph_enable_hbm_table_collision_stat, false, "enable hash collisions stat for hbm table, default false"); -PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor, - 0.75, - "the load factor of hbm table, default 0.75"); -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor, + 0.75, + "the load factor of hbm table, default 0.75"); +PHI_DEFINE_EXPORTED_bool( gpugraph_enable_gpu_direct_access, false, "enable direct access bwtween multi gpu cards, default false"); -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( gpugraph_enable_segment_merge_grads, false, "enable segment merge gradients while push sparse, default false"); -PADDLE_DEFINE_EXPORTED_uint64( +PHI_DEFINE_EXPORTED_uint64( gpugraph_merge_grads_segment_size, 128, "segment size with segment gradient merge, default 128"); -PADDLE_DEFINE_EXPORTED_uint64(gpugraph_slot_feasign_max_num, - 5, - "max feasign number in one slot, default 5"); -PADDLE_DEFINE_EXPORTED_int32( +PHI_DEFINE_EXPORTED_uint64(gpugraph_slot_feasign_max_num, + 5, + "max feasign number in one slot, default 5"); +PHI_DEFINE_EXPORTED_int32( gpugraph_dedup_pull_push_mode, 0, "enable dedup keys while pull push sparse, default 0"); -PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, - true, - "enable load_node_list_into_hbm, default true"); -PADDLE_DEFINE_EXPORTED_int32(gpugraph_sparse_table_storage_mode, - 0, - "parse_table_storage_mode, default 0"); -PADDLE_DEFINE_EXPORTED_bool(enable_auto_detect_gpu_topo, - true, - "enable auto detect gpu topo, default true"); -PADDLE_DEFINE_EXPORTED_bool(enable_auto_rdma_trans, - true, - "enable auto gpu rdma trans, default true"); -PADDLE_DEFINE_EXPORTED_bool(enable_tracker_all2all, - false, - "enable tracker all2all log, default false"); -PADDLE_DEFINE_EXPORTED_bool(enable_all2all_use_fp16, - false, - "enable all2all use fp16, default false"); -PADDLE_DEFINE_EXPORTED_bool(enable_sparse_inner_gather, - false, - "enable sparse inner gather, default false"); -PADDLE_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, - false, - "enable debug gpu memory, default false"); +PHI_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, + true, + "enable load_node_list_into_hbm, default true"); +PHI_DEFINE_EXPORTED_int32(gpugraph_sparse_table_storage_mode, + 0, + "parse_table_storage_mode, default 0"); +PHI_DEFINE_EXPORTED_bool(enable_auto_detect_gpu_topo, + true, + "enable auto detect gpu topo, default true"); +PHI_DEFINE_EXPORTED_bool(enable_auto_rdma_trans, + true, + "enable auto gpu rdma trans, default true"); +PHI_DEFINE_EXPORTED_bool(enable_tracker_all2all, + false, + "enable tracker all2all log, default false"); +PHI_DEFINE_EXPORTED_bool(enable_all2all_use_fp16, + false, + "enable all2all use fp16, default false"); +PHI_DEFINE_EXPORTED_bool(enable_sparse_inner_gather, + false, + "enable sparse inner gather, default false"); +PHI_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, + false, + "enable debug gpu memory, default false"); /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait @@ -1117,7 +1114,7 @@ 
PADDLE_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, * Note: nccl blocking wait. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); +PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif /** @@ -1127,7 +1124,7 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); * Value Range: bool, default=false * Example: */ -PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); +PHI_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); /** * Conv Search cache max number related FLAG @@ -1136,9 +1133,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); * Value Range: int32, default=1000000 * Example: */ -PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number, - 1000000, - "search_cache_max_number."); +PHI_DEFINE_EXPORTED_int32(search_cache_max_number, + 1000000, + "search_cache_max_number."); /** * Preformance related FLAG @@ -1149,7 +1146,7 @@ PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number, * Note: If True, EinsumOp will be optimimzed by innercache reuse, which * uses more gpu memory. */ -PADDLE_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_bool( einsum_opt, false, "EinsumOp backward will be speedup at the expense of more gpu memory."); @@ -1165,9 +1162,9 @@ PADDLE_DEFINE_EXPORTED_bool( * FLAGS_jit_engine_type == New, using InterpreterEngine by default * FLAGS_jit_engine_type == Predictor, using inference Predictor by default */ -PADDLE_DEFINE_EXPORTED_string(jit_engine_type, - "Predictor", - "Choose default funciton type in JitLayer."); +PHI_DEFINE_EXPORTED_string(jit_engine_type, + "Predictor", + "Choose default funciton type in JitLayer."); /** * Custom Device NPU related FLAG @@ -1177,7 +1174,7 @@ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, * Example: * Note: Enable NPU Storage Format for Ascend910 performance improvement. */ -PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, ""); +PHI_DEFINE_EXPORTED_bool(npu_storage_format, false, ""); #ifdef PADDLE_WITH_CUDNN_FRONTEND /** @@ -1188,7 +1185,7 @@ PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, ""); * Example: * Note: Enable CUDNNv8 Frontend API for CUDNN kernels. */ -PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, ""); +PHI_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, ""); /** * CUDNNv8 related FLAG @@ -1201,7 +1198,7 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, ""); * N times before it is actually added in the cache. It is useful when * the result of exhaustive search is unstable. */ -PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); +PHI_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); #endif // PADDLE_WITH_CUDNN_FRONTEND /** @@ -1213,9 +1210,9 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); * Note: This FLAG is only enabled when CI is running. If True, a persistent * IBuilder is added to avoid TensorRT unload/reload kernels. */ -PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, - false, - "Add a persistent ibuilder."); +PHI_DEFINE_EXPORTED_bool(trt_ibuilder_cache, + false, + "Add a persistent ibuilder."); /** * mmap_allocator related FLAG @@ -1226,9 +1223,9 @@ PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, * Note: . If True, mmap_allocator will cache shm file to decrease munmap * operation. 
*/ -PADDLE_DEFINE_EXPORTED_bool(use_shm_cache, - false, - "Use shm cache in mmap_allocator."); +PHI_DEFINE_EXPORTED_bool(use_shm_cache, + false, + "Use shm cache in mmap_allocator."); /** * Tensor operants related FLAG @@ -1242,6 +1239,6 @@ PADDLE_DEFINE_EXPORTED_bool(use_shm_cache, * - phi mode: tensor operants with only phi forward API; * - static mode: tensor operants within static graph. */ -PADDLE_DEFINE_EXPORTED_string(tensor_operants_mode, - "eager", - "Tensor operants mode"); +PHI_DEFINE_EXPORTED_string(tensor_operants_mode, + "eager", + "Tensor operants mode"); diff --git a/paddle/phi/core/flags.h b/paddle/phi/core/flags.h index e9ace6206255d..0112be93b7f92 100644 --- a/paddle/phi/core/flags.h +++ b/paddle/phi/core/flags.h @@ -24,25 +24,127 @@ #include "paddle/utils/variant.h" +#if defined(_WIN32) && defined(BUILD_PHI_SHARED) +#define PHI_EXPORT_FLAG __declspec(dllexport) +#define PHI_IMPORT_FLAG __declspec(dllimport) +#else +#define PHI_EXPORT_FLAG +#define PHI_IMPORT_FLAG +#endif // _WIN32 + +// We redefine the gflags' macro for exporting global variable + +// ----------------------------DECLARE FLAGS---------------------------- +// clang-format off +#define PHI_DECLARE_VARIABLE(type, shorttype, name) \ + namespace fL##shorttype { \ + extern PHI_IMPORT_FLAG type FLAGS_##name; \ + } \ + using fL##shorttype::FLAGS_##name +// clang-format on + +#define PHI_DECLARE_bool(name) PHI_DECLARE_VARIABLE(bool, B, name) + +#define PHI_DECLARE_int32(name) \ + PHI_DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name) + +#define PHI_DECLARE_uint32(name) \ + PHI_DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint32, U, name) + +#define PHI_DECLARE_int64(name) \ + PHI_DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name) + +#define PHI_DECLARE_uint64(name) \ + PHI_DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name) + +#define PHI_DECLARE_double(name) PHI_DECLARE_VARIABLE(double, D, name) + +#define PHI_DECLARE_string(name) \ + /* We always want to import declared variables, dll or no */ \ + namespace fLS { \ + extern PHI_IMPORT_FLAG ::fLS::clstring& FLAGS_##name; \ + } \ + using fLS::FLAGS_##name + +// ----------------------------DEFINE FLAGS---------------------------- +#define PHI_DEFINE_VARIABLE(type, shorttype, name, value, help) \ + namespace fL##shorttype { \ + static const type FLAGS_nono##name = value; \ + PHI_EXPORT_FLAG type FLAGS_##name = FLAGS_nono##name; \ + static type FLAGS_no##name = FLAGS_nono##name; \ + static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \ + #name, \ + MAYBE_STRIPPED_HELP(help), \ + __FILE__, \ + &FLAGS_##name, \ + &FLAGS_no##name); \ + } /* NOLINT */ \ + using fL##shorttype::FLAGS_##name + +#define PHI_DEFINE_bool(name, val, txt) \ + namespace fLB { \ + typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool \ + [(sizeof(::fLB::IsBoolFlag(val)) != sizeof(double)) ? 
1 : -1]; \ + } \ + PHI_DEFINE_VARIABLE(bool, B, name, val, txt) + +#define PHI_DEFINE_int32(name, val, txt) \ + PHI_DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, name, val, txt) + +#define PHI_DEFINE_uint32(name, val, txt) \ + PHI_DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint32, U, name, val, txt) + +#define PHI_DEFINE_int64(name, val, txt) \ + PHI_DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, name, val, txt) + +#define PHI_DEFINE_uint64(name, val, txt) \ + PHI_DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, name, val, txt) + +#define PHI_DEFINE_double(name, val, txt) \ + PHI_DEFINE_VARIABLE(double, D, name, val, txt) + +#define PHI_DEFINE_string(name, val, txt) \ + namespace fLS { \ + using ::fLS::clstring; \ + using ::fLS::StringFlagDestructor; \ + static union { \ + void* align; \ + char s[sizeof(clstring)]; \ + } s_##name[2]; \ + clstring* const FLAGS_no##name = \ + ::fLS::dont_pass0toDEFINE_string(s_##name[0].s, val); \ + static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \ + #name, \ + MAYBE_STRIPPED_HELP(txt), \ + __FILE__, \ + FLAGS_no##name, \ + new (s_##name[1].s) clstring(*FLAGS_no##name)); \ + static StringFlagDestructor d_##name(s_##name[0].s, s_##name[1].s); \ + extern PHI_EXPORT_FLAG clstring& FLAGS_##name; \ + using fLS::FLAGS_##name; \ + clstring& FLAGS_##name = *FLAGS_no##name; \ + } /* NOLINT */ \ + using fLS::FLAGS_##name + namespace phi { struct FlagInfo { using ValueType = paddle::variant; std::string name; - mutable void *value_ptr; + mutable void* value_ptr; ValueType default_value; std::string doc; bool is_writable; }; using ExportedFlagInfoMap = std::map; -const ExportedFlagInfoMap &GetExportedFlagInfoMap(); -ExportedFlagInfoMap *GetMutableExportedFlagInfoMap(); +const ExportedFlagInfoMap& GetExportedFlagInfoMap(); +ExportedFlagInfoMap* GetMutableExportedFlagInfoMap(); -#define __PADDLE_DEFINE_EXPORTED_FLAG( \ +#define __PHI_DEFINE_EXPORTED_FLAG( \ __name, __is_writable, __cpp_type, __gflag_type, __default_value, __doc) \ - DEFINE_##__gflag_type(__name, __default_value, __doc); \ + PHI_DEFINE_##__gflag_type(__name, __default_value, __doc); \ struct __PaddleRegisterFlag_##__name { \ __PaddleRegisterFlag_##__name() { \ using FlagDeclaredType = \ @@ -50,8 +152,8 @@ ExportedFlagInfoMap *GetMutableExportedFlagInfoMap(); static_assert(std::is_same::value || \ std::is_arithmetic::value, \ "FLAGS should be std::string or arithmetic type"); \ - auto *instance = ::phi::GetMutableExportedFlagInfoMap(); \ - auto &info = (*instance)[#__name]; \ + auto* instance = ::phi::GetMutableExportedFlagInfoMap(); \ + auto& info = (*instance)[#__name]; \ info.name = #__name; \ info.value_ptr = &(FLAGS_##__name); \ info.default_value = static_cast<__cpp_type>(__default_value); \ @@ -73,26 +175,25 @@ ExportedFlagInfoMap *GetMutableExportedFlagInfoMap(); UNUSED static int __paddle_use_flag_##__name = \ TouchPaddleFlagRegister_##__name() -#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc) -#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc) +#define PHI_DEFINE_EXPORTED_bool(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc) +#define PHI_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc) -#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG(name, 
true, int32_t, int32, default_value, doc) +#define PHI_DEFINE_EXPORTED_int32(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc) -#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc) +#define PHI_DEFINE_EXPORTED_int64(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc) -#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG( \ - name, true, uint64_t, uint64, default_value, doc) +#define PHI_DEFINE_EXPORTED_uint64(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, doc) -#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc) +#define PHI_DEFINE_EXPORTED_double(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc) -#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc) \ - __PADDLE_DEFINE_EXPORTED_FLAG( \ +#define PHI_DEFINE_EXPORTED_string(name, default_value, doc) \ + __PHI_DEFINE_EXPORTED_FLAG( \ name, true, ::std::string, string, default_value, doc) } // namespace phi diff --git a/paddle/phi/kernels/funcs/jit/gen_base.cc b/paddle/phi/kernels/funcs/jit/gen_base.cc index eadf636179636..1e5731e60ceaf 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.cc +++ b/paddle/phi/kernels/funcs/jit/gen_base.cc @@ -25,7 +25,7 @@ #define posix_memalign_free free #endif -DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +PHI_DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); namespace phi { namespace jit { diff --git a/paddle/phi/kernels/funcs/jit/gen_base.h b/paddle/phi/kernels/funcs/jit/gen_base.h index 8795d36fbc12d..c72c0c52792c7 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.h +++ b/paddle/phi/kernels/funcs/jit/gen_base.h @@ -23,9 +23,10 @@ #endif #include "gflags/gflags.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/jit/kernel_base.h" -DECLARE_bool(dump_jitcode); +PHI_DECLARE_bool(dump_jitcode); namespace phi { namespace jit { diff --git a/paddle/utils/pybind.cc b/paddle/utils/pybind.cc index 7e93dcff1400a..24f4c2bd22a8d 100644 --- a/paddle/utils/pybind.cc +++ b/paddle/utils/pybind.cc @@ -14,10 +14,10 @@ #include "paddle/utils/pybind.h" -#include "gflags/gflags.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/flags.h" -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); namespace paddle { namespace pybind { diff --git a/test/cpp/inference/test_helper.h b/test/cpp/inference/test_helper.h index f0ed56d1e19f6..0685b90de1fa2 100644 --- a/test/cpp/inference/test_helper.h +++ b/test/cpp/inference/test_helper.h @@ -24,8 +24,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/backends/dynload/port.h" +#include "paddle/phi/core/flags.h" -DECLARE_bool(use_mkldnn); +PHI_DECLARE_bool(use_mkldnn); namespace paddle { bool gpu_place_used(const paddle::PaddlePlace& place) { diff --git a/test/cpp/prim/test_eager_prim.cc b/test/cpp/prim/test_eager_prim.cc index d2cdeb80495c1..9d2166d457643 100644 --- a/test/cpp/prim/test_eager_prim.cc +++ b/test/cpp/prim/test_eager_prim.cc @@ -14,7 +14,6 @@ #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -22,12 +21,13 @@ #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/prim/utils/utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "test/cpp/eager/test_utils.h" #include "test/cpp/prim/init_env_utils.h" -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(tanh, CPU, ALL_LAYOUT); diff --git a/test/cpp/prim/test_static_prim.cc b/test/cpp/prim/test_static_prim.cc index 1ae7303d10d75..936ae9babfe36 100644 --- a/test/cpp/prim/test_static_prim.cc +++ b/test/cpp/prim/test_static_prim.cc @@ -25,10 +25,11 @@ #include "paddle/fluid/prim/utils/utils.h" #include "paddle/phi/api/include/operants_manager.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" DECLARE_bool(prim_enabled); -DECLARE_string(tensor_operants_mode); +PHI_DECLARE_string(tensor_operants_mode); PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(tanh, CPU, ALL_LAYOUT); From c1a61fc0447f16c419a5b86ee35649adda929996 Mon Sep 17 00:00:00 2001 From: Zero Rains Date: Tue, 25 Apr 2023 16:01:18 +0800 Subject: [PATCH 055/405] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=204?= =?UTF-8?q?=20No.33=E3=80=91=E4=B8=BA=20Paddle=20=E4=BC=98=E5=8C=96=20Hist?= =?UTF-8?q?ogram=20op=20=E5=9C=A8=20GPU=20=E4=B8=8A=E7=9A=84=E8=AE=A1?= =?UTF-8?q?=E7=AE=97=E6=80=A7=E8=83=BD=20(#53112)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * create KernelMinMax to optimize the performance of histogram op in GPU * change to block and warp wise operation * remove the time in DtoH * fix a bug --- paddle/phi/kernels/gpu/histogram_kernel.cu | 105 ++++++++++++++------- 1 file changed, 73 insertions(+), 32 deletions(-) diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 111b13f11dd0e..aa10aea35f867 100644 --- a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -18,8 +18,7 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -46,8 +45,8 @@ template __global__ void KernelHistogram(const T* input, const int total_elements, const int64_t nbins, - const T min_value, - const T max_value, + const T* min_value, + const T* max_value, int64_t* output) { extern __shared__ int64_t buf_hist[]; for (int i = threadIdx.x; i 
< nbins; i += blockDim.x) { @@ -58,9 +57,9 @@ __global__ void KernelHistogram(const T* input, CUDA_KERNEL_LOOP(input_index, total_elements) { // const IndexType input_index = threadIdx.x + blockIdx.x * blockDim.x; const auto input_value = input[input_index]; - if (input_value >= min_value && input_value <= max_value) { + if (input_value >= *min_value && input_value <= *max_value) { const IndexType output_index = - GetBin(input_value, min_value, max_value, nbins); + GetBin(input_value, *min_value, *max_value, nbins); phi::CudaAtomicAdd(&buf_hist[output_index], 1); } } @@ -71,6 +70,60 @@ __global__ void KernelHistogram(const T* input, } } +template +__global__ void KernelMinMax(const T* input, + const int numel, + const int block_num, + T* min_ptr, + T* max_ptr) { + int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + int64_t i = index; + T min_value = static_cast(i < numel ? input[i] : input[0]); + T max_value = static_cast(i < numel ? input[i] : input[0]); + + for (; i < numel; i += blockDim.x * gridDim.x) { + T value = static_cast(input[i]); + min_value = value < min_value ? value : min_value; + max_value = value > max_value ? value : max_value; + } + if (max_ptr && min_ptr) { + __syncthreads(); + T block_min_value = phi::funcs::BlockReduceMin(min_value, FINAL_MASK); + T block_max_value = phi::funcs::BlockReduceMax(max_value, FINAL_MASK); + + if (threadIdx.x == 0) { + min_ptr[blockIdx.x] = block_min_value; + max_ptr[blockIdx.x] = block_max_value; + } + } + __syncthreads(); + if (index == 0) { + if (min_ptr && max_ptr) { + min_value = min_ptr[0]; + max_value = max_ptr[0]; + for (int64_t i = 1; i < block_num; i++) { + min_ptr[0] = min_ptr[i] < min_value ? min_ptr[i] : min_value; + max_ptr[0] = max_ptr[i] > max_value ? max_ptr[i] : max_value; + } + if (min_ptr[0] == max_ptr[0]) { + min_ptr[0] = min_ptr[0] - 1; + max_ptr[0] = max_ptr[0] + 1; + } + } + } +} + +template +__global__ void KernelMinMax(const T min_value, + const T max_value, + T* min_ptr, + T* max_ptr) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + min_ptr[0] = min_value; + max_ptr[0] = max_value; + } +} + template void HistogramKernel(const Context& dev_ctx, const DenseTensor& input, @@ -93,32 +146,20 @@ void HistogramKernel(const Context& dev_ctx, T output_min = static_cast(minval); T output_max = static_cast(maxval); - - if (output_min == output_max) { - auto input_x = phi::EigenVector::Flatten(input); - - DenseTensor input_min_t, input_max_t; - input_min_t.Resize({1}); - input_max_t.Resize({1}); - auto* input_min_data = dev_ctx.template Alloc(&input_min_t); - auto* input_max_data = dev_ctx.template Alloc(&input_max_t); - auto input_min_scala = phi::EigenScalar::From(input_min_t); - auto input_max_scala = phi::EigenScalar::From(input_max_t); - - auto* place = dev_ctx.eigen_device(); - input_min_scala.device(*place) = input_x.minimum(); - input_max_scala.device(*place) = input_x.maximum(); - - DenseTensor input_min_cpu, input_max_cpu; - phi::Copy(dev_ctx, input_min_t, phi::CPUPlace(), true, &input_min_cpu); - phi::Copy(dev_ctx, input_max_t, phi::CPUPlace(), true, &input_max_cpu); - - output_min = input_min_cpu.data()[0]; - output_max = input_max_cpu.data()[0]; - } + DenseTensor min_max; + int block_num = GET_BLOCKS(input_numel); + min_max.Resize({2 * block_num}); + auto* min_block_ptr = dev_ctx.template Alloc(&min_max); + auto* max_block_ptr = min_block_ptr + block_num; if (output_min == output_max) { - output_min = output_min - 1; - output_max = output_max + 1; + KernelMinMax<<>>( + input_data, input_numel, block_num, 
min_block_ptr, max_block_ptr); + } else { + KernelMinMax<<<1, 1, 0, dev_ctx.stream()>>>( + output_min, output_max, min_block_ptr, max_block_ptr); } PADDLE_ENFORCE_EQ((std::isinf(static_cast(output_min)) || @@ -142,7 +183,7 @@ void HistogramKernel(const Context& dev_ctx, PADDLE_CUDA_NUM_THREADS, nbins * sizeof(int64_t), stream>>>( - input_data, input_numel, nbins, output_min, output_max, out_data); + input_data, input_numel, nbins, min_block_ptr, max_block_ptr, out_data); } } // namespace phi From f6f48780e44bbdf4773cd71144a8c1dd971c3d1f Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 25 Apr 2023 16:13:59 +0800 Subject: [PATCH 056/405] Register fluid xpu kerenls to phi [part 1] (#53187) * update * fix bug * Revert "affine_channel_op" --- cmake/operators.cmake | 1 + .../operators/beam_search_decode_op_xpu.cc | 22 ++++++++++--------- paddle/fluid/operators/beam_search_op_xpu.cc | 15 +++++++------ .../collective/c_allgather_op_xpu.cc | 17 ++++++++------ .../collective/c_allreduce_max_op_xpu.cc | 18 ++++++++++----- .../collective/c_allreduce_min_op_xpu.cc | 16 +++++++++++--- .../operators/collective/c_allreduce_op.h | 4 ++++ .../collective/c_allreduce_prod_op_xpu.cc | 16 +++++++++++--- .../collective/c_allreduce_sum_op_xpu.cc | 18 ++++++++++----- .../collective/c_broadcast_op_xpu.cc | 11 ++++++---- .../operators/collective/c_concat_op_xpu.cc | 7 +++--- .../collective/c_embedding_op_xpu.cc | 14 +++++------- .../operators/collective/c_identity_op_xpu.cc | 15 ++++++++----- .../collective/c_reduce_max_op_xpu.cc | 10 +++++++-- .../collective/c_reduce_min_op_xpu.cc | 10 +++++++-- .../fluid/operators/collective/c_reduce_op.h | 4 ++++ .../collective/c_reduce_prod_op_xpu.cc | 10 +++++++-- .../collective/c_reduce_sum_op_xpu.cc | 11 +++++++--- .../collective/mp_allreduce_sum_op_xpu.cc | 18 ++++++++++----- 19 files changed, 161 insertions(+), 76 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index a9cac23e8f0b0..e22a747688b76 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -470,6 +470,7 @@ function(op_library TARGET) foreach(xpu_src ${xpu_cc_srcs}) set(op_name "") find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL" op_name) + find_phi_register(${xpu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") if(NOT ${op_name} EQUAL "") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") set(pybind_flag 1) diff --git a/paddle/fluid/operators/beam_search_decode_op_xpu.cc b/paddle/fluid/operators/beam_search_decode_op_xpu.cc index b78cec34cb900..5fd2b2fc6fa35 100644 --- a/paddle/fluid/operators/beam_search_decode_op_xpu.cc +++ b/paddle/fluid/operators/beam_search_decode_op_xpu.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class BeamSearchDecodeXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -111,13 +111,15 @@ class BeamSearchDecodeXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - beam_search_decode, - ops::BeamSearchDecodeXPUKernel, - ops::BeamSearchDecodeXPUKernel, - ops::BeamSearchDecodeXPUKernel, - ops::BeamSearchDecodeXPUKernel, - ops::BeamSearchDecodeXPUKernel); +namespace plat = paddle::platform; + +PD_REGISTER_STRUCT_KERNEL(beam_search_decode, + XPU, + ALL_LAYOUT, + ops::BeamSearchDecodeXPUKernel, + float, + double, + plat::float16, + int, + int64_t) {} #endif diff --git a/paddle/fluid/operators/beam_search_op_xpu.cc b/paddle/fluid/operators/beam_search_op_xpu.cc index 9f1d1488d9a64..4140fc6f2910d 100644 --- a/paddle/fluid/operators/beam_search_op_xpu.cc +++ b/paddle/fluid/operators/beam_search_op_xpu.cc @@ -18,11 +18,12 @@ limitations under the License. */ #include "paddle/fluid/operators/beam_search_op.h" namespace ops = paddle::operators; -using XPUCtx = paddle::platform::XPUDeviceContext; - -REGISTER_OP_XPU_KERNEL(beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); +PD_REGISTER_STRUCT_KERNEL(beam_search, + XPU, + ALL_LAYOUT, + ops::BeamSearchOpKernel, + float, + double, + int, + int64_t) {} #endif diff --git a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc index 107f5ccd1b563..1e7d3f3a9fec1 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CAllGatherOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -80,9 +80,12 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_allgather, - ops::CAllGatherOpXPUKernel, - ops::CAllGatherOpXPUKernel, - ops::CAllGatherOpXPUKernel, - ops::CAllGatherOpXPUKernel, - ops::CAllGatherOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL(c_allgather, + XPU, + ALL_LAYOUT, + ops::CAllGatherOpXPUKernel, + float, + double, + plat::float16, + int, + int64_t) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 86527cf0e6e1e..8c648b4ae4a37 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -14,10 +14,18 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_ALLREDUCE_XPU_KERNEL(CAllReduceMax, kRedMax) +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; - -REGISTER_OP_XPU_KERNEL(c_allreduce_max, - ops::CAllReduceOpXPUKernel, - ops::CAllReduceOpXPUKernel, - ops::CAllReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, + XPU, + ALL_LAYOUT, + ops::CAllReduceMaxXPUKernel, + float, + int, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc index b8a8cfab3f75f..f9be16781af70 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -14,8 +14,18 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_ALLREDUCE_XPU_KERNEL(CAllReduceMin, kRedMin) +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; - -REGISTER_OP_XPU_KERNEL(c_allreduce_min, - ops::CAllReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL(c_allreduce_min, + XPU, + ALL_LAYOUT, + ops::CAllReduceMinXPUKernel, + float, + int, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index a57486584463d..82cb3a3772e3a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -223,6 +223,10 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { } }; +#define DEFINE_C_ALLREDUCE_XPU_KERNEL(op_name, red_type) \ + template \ + class op_name##XPUKernel : public CAllReduceOpXPUKernel {}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc index 6e2b2df297313..5558b1722093a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -14,8 +14,18 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_ALLREDUCE_XPU_KERNEL(CAllReduceProd, kRedProd) +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; - -REGISTER_OP_XPU_KERNEL(c_allreduce_prod, - ops::CAllReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod, + XPU, + ALL_LAYOUT, + ops::CAllReduceProdXPUKernel, + float, + int, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc index a4d1c62e821ec..1d4c5f63b5850 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -14,10 +14,18 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_ALLREDUCE_XPU_KERNEL(CAllReduceSum, kRedSum) +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; - -REGISTER_OP_XPU_KERNEL(c_allreduce_sum, - ops::CAllReduceOpXPUKernel, - ops::CAllReduceOpXPUKernel, - ops::CAllReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, + XPU, + ALL_LAYOUT, + ops::CAllReduceSumXPUKernel, + float, + int, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc b/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc index 3f12adcea3f8f..676743e22c6cd 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_xpu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CBroadcastOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -113,6 +113,9 @@ class CBroadcastOpXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_broadcast, - ops::CBroadcastOpXPUKernel, - ops::CBroadcastOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL(c_broadcast, + XPU, + ALL_LAYOUT, + ops::CBroadcastOpXPUKernel, + float, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_concat_op_xpu.cc b/paddle/fluid/operators/collective/c_concat_op_xpu.cc index b31be9bde44dc..7ecf5d08dba84 100644 --- a/paddle/fluid/operators/collective/c_concat_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_concat_op_xpu.cc @@ -28,7 +28,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CConcatOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -118,6 +118,5 @@ class CConcatOpXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_concat, - ops::CConcatOpXPUKernel, - ops::CConcatOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL( + c_concat, XPU, ALL_LAYOUT, ops::CConcatOpXPUKernel, float, plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_embedding_op_xpu.cc b/paddle/fluid/operators/collective/c_embedding_op_xpu.cc index 8590ff257305f..b46a561532d99 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_xpu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class CEmbeddingOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -68,7 +68,7 @@ class CEmbeddingOpXPUKernel : public framework::OpKernel { } }; -template +template class CEmbeddingGradOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -140,9 +140,7 @@ class CEmbeddingGradOpXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL( - c_embedding, - ops::CEmbeddingOpXPUKernel); -REGISTER_OP_XPU_KERNEL( - c_embedding_grad, - ops::CEmbeddingGradOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL( + c_embedding, XPU, ALL_LAYOUT, ops::CEmbeddingOpXPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + c_embedding_grad, XPU, ALL_LAYOUT, ops::CEmbeddingGradOpXPUKernel, float) {} diff --git a/paddle/fluid/operators/collective/c_identity_op_xpu.cc b/paddle/fluid/operators/collective/c_identity_op_xpu.cc index d92c90e3f6385..a627d2748b6d7 100644 --- a/paddle/fluid/operators/collective/c_identity_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_identity_op_xpu.cc @@ -14,9 +14,12 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_identity, - ops::CIdentityOpKernel, - ops::CIdentityOpKernel, - ops::CIdentityOpKernel, - ops::CIdentityOpKernel, - ops::CIdentityOpKernel); +PD_REGISTER_STRUCT_KERNEL(c_identity, + XPU, + ALL_LAYOUT, + ops::CIdentityOpKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 3ae1af1d08315..6712a6eb500ee 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -14,8 +14,14 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_REDUCE_XPU_KERNEL(CReduceMax, kRedMax); +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_reduce_max, - ops::CReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL( + c_reduce_max, XPU, ALL_LAYOUT, ops::CReduceMaxXPUKernel, float) {} diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index 500ea2abe6ab5..440c2b85acde3 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -14,8 +14,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_REDUCE_XPU_KERNEL(CReduceMin, kRedMin); +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_reduce_min, - ops::CReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL( + c_reduce_min, XPU, ALL_LAYOUT, ops::CReduceMinXPUKernel, float) {} diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 4ff84af7a6ee2..09b9b9f5a5b2b 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -198,6 +198,10 @@ class CReduceOpXPUKernel : public framework::OpKernel { } }; +#define DEFINE_C_REDUCE_XPU_KERNEL(op_name, red_type) \ + template \ + class op_name##XPUKernel : public CReduceOpXPUKernel {}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc index 15ef32289655e..1541918396d07 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -14,8 +14,14 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_REDUCE_XPU_KERNEL(CReduceProd, kRedProd); +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_reduce_prod, - ops::CReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL( + c_reduce_prod, XPU, ALL_LAYOUT, ops::CReduceProdXPUKernel, float) {} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc index 9f35a6866eeb1..230dca3503538 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -14,9 +14,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_REDUCE_XPU_KERNEL(CReduceSum, kRedSum); +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_reduce_sum, - ops::CReduceOpXPUKernel, - ops::CReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL( + c_reduce_sum, XPU, ALL_LAYOUT, ops::CReduceSumXPUKernel, float) {} diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc index 39dcc3470f6ac..9638bf68d1717 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op_xpu.cc @@ -14,10 +14,18 @@ #include "paddle/fluid/operators/collective/c_allreduce_op.h" +namespace paddle { +namespace operators { +DEFINE_C_ALLREDUCE_XPU_KERNEL(CAllReduceSum, kRedSum) +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; - -REGISTER_OP_XPU_KERNEL(mp_allreduce_sum, - ops::CAllReduceOpXPUKernel, - ops::CAllReduceOpXPUKernel, - ops::CAllReduceOpXPUKernel) +PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, + XPU, + ALL_LAYOUT, + ops::CAllReduceSumXPUKernel, + float, + int, + plat::float16) {} From 00f747f2cce7251d23f7688c334de98bd5692dfa Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Tue, 25 Apr 2023 16:20:07 +0800 Subject: [PATCH 057/405] [Paddle Inference] add generic plugin for p_norm (#53278) --- .../tensorrt/dynamic_shape_infermeta.cc | 51 +++++-- .../dynamic_shape_infermeta_registry.h | 1 + .../tensorrt/plugin/generic_plugin.cu | 2 +- test/ir/inference/test_trt_convert_p_norm.py | 134 ++++++++++++++++++ 4 files changed, 177 insertions(+), 11 deletions(-) create mode 100644 test/ir/inference/test_trt_convert_p_norm.py diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc index 8752bdf793329..fe0c4e0d18ef4 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" #include "paddle/fluid/inference/tensorrt/helper.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" namespace paddle { @@ -322,20 +322,51 @@ nvinfer1::DimsExprs PNormInferMeta( int nb_inputs, nvinfer1::IExprBuilder& expr_builder, // NOLINT const framework::OpDesc& op_desc) { - const nvinfer1::DimsExprs x_dim = inputs[0]; - std::vector reduce_dims; - std::vector keep_dims; - bool asvector = PADDLE_GET_CONST(bool, op_desc.GetAttr("asvector")); bool keepdim = PADDLE_GET_CONST(bool, op_desc.GetAttr("keepdim")); int axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + auto x_dim = inputs[0]; + auto x_rank = x_dim.nbDims; + + PADDLE_ENFORCE_GE(axis, + -x_rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. " + "Current Input(X)'s shape is=[%s].", + axis, + x_rank, + x_dim.d)); + PADDLE_ENFORCE_LT(axis, + x_rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. 
" + "Current Input(X)'s shape is=[%s].", + axis, + x_rank, + x_dim.d)); + + // TODO(liuyuanle): support asvector = True + PADDLE_ENFORCE_EQ( + asvector, + false, + phi::errors::InvalidArgument( + "p_norm only support asvector=false, but received asvector: %d.", + asvector)); + + std::vector reduce_dims; + if (asvector) { reduce_dims.emplace_back(expr_builder.constant(1)); - keep_dims.emplace_back(expr_builder.constant(1)); if (keepdim) { for (int i = 1; i < x_dim.nbDims; ++i) { - keep_dims.emplace_back(expr_builder.constant(1)); + reduce_dims.emplace_back(expr_builder.constant(1)); + } + x_dim.nbDims = reduce_dims.size(); + for (size_t i = 0; i < reduce_dims.size(); i++) { + x_dim.d[i] = reduce_dims[i]; } } } else { @@ -347,12 +378,11 @@ nvinfer1::DimsExprs PNormInferMeta( reduce_dims.emplace_back(expr_builder.constant(1)); } } - keep_dims[axis] = expr_builder.constant(1); + x_dim.d[axis] = expr_builder.constant(1); nvinfer1::DimsExprs output; if (keepdim) { - output.nbDims = keep_dims.size(); - for (int i = 0; i < output.nbDims; i++) output.d[i] = keep_dims[i]; + output = x_dim; } else { output.nbDims = reduce_dims.size(); for (int i = 0; i < output.nbDims; i++) output.d[i] = reduce_dims[i]; @@ -396,6 +426,7 @@ PD_REGISTER_DYNAMIC_INFER_META_FN(inverse, UnchangedInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(moe, MoeInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(pad3d, Pad3dInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(grid_sampler, GridSamplerInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(p_norm, PNormInferMeta); } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h index c0ddaf5d983ef..db9b6e9313b5d 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h @@ -28,6 +28,7 @@ USE_TRT_DYNAMIC_INFER_META_FN(scatter_nd_add); USE_TRT_DYNAMIC_INFER_META_FN(pad3d); USE_TRT_DYNAMIC_INFER_META_FN(inverse); USE_TRT_DYNAMIC_INFER_META_FN(grid_sampler); +USE_TRT_DYNAMIC_INFER_META_FN(p_norm); } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu index e97aa95a6aa03..f9bbb27958676 100644 --- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -472,7 +472,7 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc, cudaStream_t stream) TRT_NOEXCEPT { platform::CUDAPlace place(platform::GetCurrentDeviceId()); - // [TODO]now generic plugin do not support FP16 and INT8 precision + // [TODO]now generic plugin do not support INT8 precision auto protoType2PhiType = [&](int proto_type, nvinfer1::DataType nv_dtype) -> std::pair { diff --git a/test/ir/inference/test_trt_convert_p_norm.py b/test/ir/inference/test_trt_convert_p_norm.py new file mode 100644 index 0000000000000..5aa48135a89f6 --- /dev/null +++ b/test/ir/inference/test_trt_convert_p_norm.py @@ -0,0 +1,134 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +from typing import Any, Dict, List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertCeluTest(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input1(dims, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([3]).astype(np.float32) + elif dims == 2: + return np.ones([3, 64]).astype(np.float32) + elif dims == 3: + return np.ones([3, 64, 64]).astype(np.float32) + else: + return np.ones([1, 3, 64, 64]).astype(np.float32) + + for dims in [2, 3, 4]: + # TODO(liuyuanle): support asvector = True + for asvector in [False]: + for keepdim in [False, True]: + for porder in [0, 1, 2, 3]: + for axis in [-1]: + self.dims = dims + + dics = [ + { + "asvector": asvector, + "keepdim": keepdim, + "axis": axis, + "porder": porder, + } + ] + + ops_config = [ + { + "op_type": "p_norm", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial( + generate_input1, dims, dics + ) + ) + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [128]} + self.dynamic_shape.opt_input_shape = {"input_data": [64]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} + self.dynamic_shape.max_input_shape = { + "input_data": [10, 64, 64] + } + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64] + } + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape mode + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 503f422eaa7a7c4aaeb87a202e3877168c77a1b9 Mon Sep 17 00:00:00 2001 From: wuhuachaocoding 
<77733235+wuhuachaocoding@users.noreply.github.com> Date: Tue, 25 Apr 2023 16:22:20 +0800 Subject: [PATCH 058/405] add mp_sync config. (#53254) --- .../framework/distributed_strategy.proto | 1 + .../fleet/base/distributed_strategy.py | 6 ++ .../hybrid_parallel_optimizer.py | 95 +++++++++++++------ .../fleet/hybrid_parallel_mp_model.py | 52 +++++++++- .../fleet/test_fleet_distributed_strategy.py | 30 ++++++ 5 files changed, 150 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index d0e494ad49491..6b093e9ee03b8 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -55,6 +55,7 @@ message MpConfig { optional bool sync_param= 1 [ default = false ]; optional bool sync_grad= 2 [ default = false ]; optional bool sync_moment= 3 [ default = false ]; + optional string sync_mode= 4 [ default = 'broadcast' ]; } message PpConfig { diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 14e2fc09d3390..b751989102483 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -146,6 +146,8 @@ def __init__(self): self.strategy.sync_nccl_allreduce = bool(_global_flags()[key]) self.hybrid_parallel_order = ['dp', 'pp', 'sharding', 'mp'] + self.sync_param_name = ["embedding", "layer_norm", ".b_"] + self.__lock_attr = True logger.info("distributed strategy initialized") @@ -1698,6 +1700,10 @@ def hybrid_configs(self, configs): ) if "mp_configs" in configs: + if "sync_param_name" in configs["mp_configs"]: + self.sync_param_name = configs["mp_configs"]["sync_param_name"] + configs["mp_configs"].pop("sync_param_name") + assign_configs_value( self.strategy.hybrid_configs.mp_configs, configs["mp_configs"] ) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index ab1b270e2fd88..3254bffb254c6 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -303,9 +303,20 @@ def __init__(self, optimizer, hcg, strategy): inner_opt._grad_clip, hcg ) - def _filter_fn(self, param): + def _insert_sync(self, sync_var, src, mp_group, sync_mode): + if sync_mode == "broadcast": + paddle.distributed.broadcast( + sync_var, src=src, group=mp_group, sync_op=True + ) + else: + paddle.distributed.all_reduce( + sync_var, group=mp_group, sync_op=True + ) + sync_var.scale_(1.0 / mp_group.nranks) + + def _filter_fn(self, param, strategy): p_name = param.name - tar_param = ["embedding", "layer_norm", ".b_"] + tar_param = strategy.sync_param_name if param.is_distributed is False: for tar in tar_param: if tar in p_name: @@ -329,26 +340,48 @@ def _step(self, parameters_list): or mp_configs.sync_moment ): params = sorted( - [p for p in parameters_list if self._filter_fn(p)], + [ + p + for p in parameters_list + if self._filter_fn(p, fleet.fleet._user_defined_strategy) + ], key=lambda p: p.name, ) + # Grad sync before opt if mp_group.nranks > 1 and mp_configs and mp_configs.sync_grad: for p in params: - if p.grad is None: - continue - paddle.distributed.broadcast( - p.grad, src=src_rank, group=mp_group, sync_op=True - 
) + if hasattr(p, "main_grad") and p.main_grad is not None: + assert p.grad is None + self._insert_sync( + p.main_grad, src_rank, mp_group, mp_configs.sync_mode + ) + elif p.grad is not None: + self._insert_sync( + p.grad, src_rank, mp_group, mp_configs.sync_mode + ) self._inner_opt.step() if mp_group.nranks > 1 and mp_configs and mp_configs.sync_param: for p in params: - paddle.distributed.broadcast( - p, src=src_rank, group=mp_group, sync_op=True - ) + # Param sync after opt + self._insert_sync(p, src_rank, mp_group, mp_configs.sync_mode) + + # Master param sync after opt + if ( + hasattr(self._inner_opt, "_multi_precision") + and self._inner_opt._multi_precision + and p.name in self._inner_opt._master_weights + ): + self._insert_sync( + self._inner_opt._master_weights[p.name], + src_rank, + mp_group, + mp_configs.sync_mode, + ) + # Moment sync after opt if mp_group.nranks > 1 and mp_configs and mp_configs.sync_moment: for p in params: # support opt state of adam and adamw to broadcast now. @@ -357,28 +390,30 @@ def _step(self, parameters_list): (paddle.optimizer.Adam, paddle.optimizer.AdamW), ): if ( - self._inner_opt._multi_precision - and p.name in self._master_weights + p.name + in self._inner_opt._accumulators[ + self._inner_opt._moment1_acc_str + ] ): - paddle.distributed.broadcast( - self._inner_opt._master_weights[p.name], - src=src_rank, - group=mp_group, - sync_op=True, + moment1 = self._inner_opt._get_accumulator( + self._inner_opt._moment1_acc_str, p + ) + self._insert_sync( + moment1, src_rank, mp_group, mp_configs.sync_mode ) - moment1 = self._inner_opt._get_accumulator( - self._inner_opt._moment1_acc_str, p - ) - moment2 = self._inner_opt._get_accumulator( - self._inner_opt._moment2_acc_str, p - ) - paddle.distributed.broadcast( - moment1, src=src_rank, group=mp_group, sync_op=True - ) - paddle.distributed.broadcast( - moment2, src=src_rank, group=mp_group, sync_op=True - ) + if ( + p.name + in self._inner_opt._accumulators[ + self._inner_opt._moment2_acc_str + ] + ): + moment2 = self._inner_opt._get_accumulator( + self._inner_opt._moment2_acc_str, p + ) + self._insert_sync( + moment2, src_rank, mp_group, mp_configs.sync_mode + ) @no_grad() @framework.dygraph_only diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py index 26e740bfa6b79..82efb6fa46657 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py @@ -202,6 +202,7 @@ def build_model_optimizer_train( self, batchs, fp16=False, + amp_level="O1", mp_sync_param=False, mp_sync_grad=False, mp_sync_moment=False, @@ -232,6 +233,11 @@ def build_model_optimizer_train( learning_rate=0.1, parameters=model.parameters() ) + if fp16 and amp_level == "O2": + model, optimizer = paddle.amp.decorate( + models=model, optimizers=optimizer, level='O2' + ) + strategy = fleet.fleet._user_defined_strategy strategy.hybrid_configs = { "dp_degree": self.data_parallel_size, @@ -246,15 +252,15 @@ def build_model_optimizer_train( model = fleet.distributed_model(model) optimizer = fleet.distributed_optimizer(optimizer) - return self.train_batch(batchs, model, optimizer, fp16) + return self.train_batch(batchs, model, optimizer, fp16, amp_level) - def train_batch(self, batchs, model, optimizer, fp16=False): + def train_batch(self, batchs, model, optimizer, fp16=False, amp_level="O1"): losses = [] if fp16: 
scaler = paddle.amp.GradScaler(init_loss_scaling=1024) scaler = fleet.distributed_scaler(scaler) for batch in batchs: - with paddle.amp.auto_cast(enable=fp16, level='O1'): + with paddle.amp.auto_cast(enable=fp16, level=amp_level): output = model(batch) loss = output.mean() losses.append(loss.numpy()) @@ -295,7 +301,7 @@ def mp_sync_base( for i in range(len(losses)): np.testing.assert_allclose(losses[i], losses_sync[i], rtol=1e-6) - # test fp16 + # test fp16 O1 losses_fp16 = self.build_model_optimizer_train(batchs, fp16=True) losses_sync_fp16 = self.build_model_optimizer_train( batchs, @@ -310,6 +316,24 @@ def mp_sync_base( losses_fp16[i], losses_sync_fp16[i], rtol=1e-6 ) + # test fp16 O2 + losses_fp16_O2 = self.build_model_optimizer_train( + batchs, fp16=True, amp_level="O2" + ) + losses_sync_fp16_O2 = self.build_model_optimizer_train( + batchs, + fp16=True, + amp_level="O2", + mp_sync_param=mp_sync_param, + mp_sync_grad=mp_sync_grad, + mp_sync_moment=mp_sync_moment, + ) + + for i in range(len(losses_fp16_O2)): + np.testing.assert_allclose( + losses_fp16_O2[i], losses_sync_fp16_O2[i], rtol=1e-6 + ) + def test_mp_sync_param(self): self.mp_sync_base(mp_sync_param=True) @@ -325,6 +349,26 @@ def test_mp_sync_all(self): ) +class TestDistMPSyncModelTraning(TestDistMPSyncTraning): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + self.data_parallel_size = 1 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + "mp_configs": { + "sync_param": False, + "sync_grad": False, + "sync_moment": False, + "sync_mode": "average", + "sync_param_name": ["embedding", "layer_norm", ".b_"], + }, + } + fleet.init(is_collective=True, strategy=strategy) + + class TestDistMPTraning(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_distributed_strategy.py index 99f235b5887fd..ba49cbf125a62 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_distributed_strategy.py @@ -84,6 +84,36 @@ def test_hybrid_parallel_configs(self): self.assertEqual(strategy.hybrid_configs["mp_degree"], 2) self.assertEqual(strategy.hybrid_configs["pp_degree"], 4) + def test_hybrid_parallel_mp_configs(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 2, + "pp_degree": 4, + "mp_configs": { + "sync_param": True, + "sync_grad": False, + "sync_moment": False, + "sync_mode": "broadcast", + "sync_param_name": ["embedding", "layer_norm", ".w", ".b_"], + }, + } + self.assertEqual(strategy.hybrid_configs["dp_degree"], 1) + self.assertEqual(strategy.hybrid_configs["mp_degree"], 2) + self.assertEqual(strategy.hybrid_configs["pp_degree"], 4) + self.assertEqual(strategy.hybrid_configs["mp_configs"].sync_param, True) + self.assertEqual(strategy.hybrid_configs["mp_configs"].sync_grad, False) + self.assertEqual( + strategy.hybrid_configs["mp_configs"].sync_moment, False + ) + self.assertEqual( + strategy.hybrid_configs["mp_configs"].sync_mode, "broadcast" + ) + + self.assertEqual( + strategy.sync_param_name, ["embedding", "layer_norm", ".w", ".b_"] + ) + def test_hybrid_parallel_configs_order(self): strategy = paddle.distributed.fleet.DistributedStrategy() 
strategy.hybrid_configs = { From be1b3fc3649b694af3aaaa468d2df20e35ede3c7 Mon Sep 17 00:00:00 2001 From: sprouteer <89541335+sprouteer@users.noreply.github.com> Date: Tue, 25 Apr 2023 16:38:41 +0800 Subject: [PATCH 059/405] [XPU][BUG] Fix link_xpu_op_max_pass bug (#53258) --- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 17 +-- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 105 +++++++++++++----- 2 files changed, 88 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index af5ec2c651c8a..263ea5a09ca76 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -110,7 +110,8 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, ->assert_is_op_input(conv_type_, "Filter") ->AsInput(); auto conv_out = pattern->NewNode(conv_out_repr()) - ->assert_is_op_output(conv_type_, "Output"); + ->assert_is_op_output(conv_type_, "Output") + ->assert_has_n_outputs(1); conv->LinksFrom({input, conv_filter}).LinksTo({conv_out}); // ew_bias_add op PDNode* ew_bias_add = nullptr; @@ -190,12 +191,12 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, // ew_branch_add op if (with_branch_) { if (with_branch_x_) { - bn_out->assert_is_op_input("elementwise_add", "Y")->AsIntermediate(); + bn_out->assert_is_op_input("elementwise_add", "Y"); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "X") ->AsInput(); } else if (with_branch_y_) { - bn_out->assert_is_op_input("elementwise_add", "X")->AsIntermediate(); + bn_out->assert_is_op_input("elementwise_add", "X"); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "Y") ->AsInput(); @@ -221,13 +222,15 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, } // act op if (!act_type_.empty()) { - ew_branch_add_out->assert_is_op_input(act_type_, "X")->AsIntermediate(); + ew_branch_add_out->assert_is_op_input(act_type_, "X"); act = pattern->NewNode(act_repr())->assert_is_op(act_type_); - act_out = pattern->NewNode(act_out_repr()) - ->assert_is_op_output(act_type_, "Out") - ->assert_var_not_persistable(); + act_out = + pattern->NewNode(act_out_repr())->assert_is_op_output(act_type_, "Out"); act->LinksFrom({ew_branch_add_out}).LinksTo({act_out}); + } else { + act_out = ew_branch_add_out; } + act_out->AsOutput(); } } // namespace patterns diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index e100db36c2735..5b3f1332907a9 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -41,30 +41,39 @@ namespace patterns { struct FusionXPUOpPattern : public PatternBase { FusionXPUOpPattern(PDPattern* pattern, const std::string& name_scope, - const std::string& op_type); + const std::string& op_type, + bool with_branch); // declare operator node's name PATTERN_DECL_NODE(fusion_op); // declare variable node's name - PATTERN_DECL_NODE(out); - PATTERN_DECL_NODE(out_max); + PATTERN_DECL_NODE(input); + PATTERN_DECL_NODE(branch); private: std::string op_type_; + bool with_branch_{false}; }; FusionXPUOpPattern::FusionXPUOpPattern(PDPattern* pattern, const std::string& name_scope, - const std::string& op_type) - : PatternBase(pattern, name_scope, name_scope), op_type_(op_type) { + const std::string& op_type, + bool with_branch) + : PatternBase(pattern, name_scope, 
name_scope), + op_type_(op_type), + with_branch_(with_branch) { auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op(op_type_); - auto* out = pattern->NewNode(out_repr()) - ->assert_is_op_output(op_type_, "out") - ->assert_var_not_persistable(); - auto* out_max = pattern->NewNode(out_max_repr()) - ->assert_is_op_output(op_type_, "out_max") - ->assert_var_not_persistable(); - fusion_op->LinksTo({out, out_max}); + auto* input = + pattern->NewNode(input_repr())->assert_is_op_input(op_type_, "x"); + + PDNode* branch = nullptr; + if (with_branch_) { + branch = + pattern->NewNode(branch_repr())->assert_is_op_input(op_type_, "branch"); + fusion_op->LinksFrom({input, branch}); + } else { + fusion_op->LinksFrom({input}); + } } } // namespace patterns @@ -74,7 +83,9 @@ class LinkXPUOpMaxPass : public FusePassBase { void ApplyImpl(ir::Graph* graph) const override; private: - void ApplyImpl(ir::Graph* graph, const std::string& op_type) const; + void ApplyImpl(ir::Graph* graph, + const std::string& op_type, + bool with_branch) const; const std::string name_scope_{"multi_encoder_xpu_slice_fuse_pass"}; // ops with x_max/out_max @@ -89,8 +100,7 @@ Origin subgraph: out0 out0_max | \ - fusion_xpu_op1 - + fusion_op Fused subgraph: fusion_xpu_op0 / \ @@ -98,36 +108,77 @@ Fused subgraph: out0 out0_max | | \ / - fusion_xpu_op1 + fusion_op + +Origin subgraph1: + fusion_xpu_op0 fusion_xpu_op1 + / \ / \ + | | | | + out0 out0_max out1 out1_max + | | + (x) \ / (branch) + fusion_xpu_op2 +Fused subgraph1: + fusion_xpu_op0 fusion_xpu_op1 + / \ / \ + | | | | + out0 out0_max out1 out1_max + | | | | + (x) \ |(x_max) |(branch) /(branch_max) + \ | | / + \ | | / + \ | | / + fusion_xpu_op2 */ void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const { Init(name_scope_, graph); for (auto op_type : op_types_) { - ApplyImpl(graph, op_type); + for (auto with_branch : {true, false}) { + ApplyImpl(graph, op_type, with_branch); + } } } void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph, - const std::string& op_type) const { + const std::string& op_type, + bool with_branch) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); GraphPatternDetector gpd; patterns::FusionXPUOpPattern pattern( - gpd.mutable_pattern(), name_scope_, op_type); + gpd.mutable_pattern(), name_scope_, op_type, with_branch); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkXPUOpMaxPass fuse"; GET_IR_NODE(fusion_op); - GET_IR_NODE(out); - GET_IR_NODE(out_max); - for (auto next_op : out->outputs) { - auto* next_op_desc = next_op->Op(); - if (op_types_.count(next_op_desc->Type()) == 0) continue; - next_op_desc->SetInput("x_max", {out_max->Name()}); - IR_NODE_LINK_TO(out_max, next_op); - found_subgraph_count++; + GET_IR_NODE(input); + GET_IR_NODE(branch); + + auto* fusion_op_desc = fusion_op->Op(); + if (input->inputs[0]->Op()->HasOutput("out_max")) { + auto input_max_name = input->inputs[0]->Op()->Output("out_max"); + for (auto max_node : input->inputs[0]->outputs) { + if (input_max_name[0] == max_node->Name()) { + fusion_op_desc->SetInput("x_max", {max_node->Name()}); + IR_NODE_LINK_TO(max_node, fusion_op); + found_subgraph_count++; + } + } + } + + if (with_branch) { + if (branch->inputs[0]->Op()->HasOutput("out_max")) { + auto branch_max_name = branch->inputs[0]->Op()->Output("out_max"); + for (auto max_node : branch->inputs[0]->outputs) { + if (branch_max_name[0] == max_node->Name()) { + 
fusion_op_desc->SetInput("branch_max", {max_node->Name()}); + IR_NODE_LINK_TO(max_node, fusion_op); + found_subgraph_count++; + } + } + } } }; From dda6b9d59c8b04ac92678bb98226f3b742c20258 Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Tue, 25 Apr 2023 16:47:19 +0800 Subject: [PATCH 060/405] update tile_grad composite rule (#53261) --- paddle/fluid/operators/tile_op.cc | 19 +++++++++++++++---- .../composite_backward_api.h | 17 +---------------- paddle/phi/infermeta/unary.cc | 6 +++--- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 2fcf702728501..26657ce42f303 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -171,14 +171,25 @@ class TileCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { paddle::Tensor x = this->GetSingleForwardInput("X"); paddle::Tensor out_grad = this->GetSingleOutputGrad("Out"); paddle::Tensor x_grad = this->GetSingleInputGrad("X"); + paddle::optional tensor_repeat_times = + this->GetOptionalSingleForwardInput("RepeatTimes"); + paddle::optional tensor_repeat_times_attr = + this->GetOptionalSingleForwardInput("repeat_times_tensor"); auto dx_ptr = this->GetOutputPtr(&x_grad); std::string dx_name = this->GetOutputName(x_grad); auto repeat_times = this->Attr>("repeat_times"); - VLOG(6) << "Runing tile_grad composite func"; - prim::tile_grad( - x, out_grad, paddle::experimental::IntArray(repeat_times), dx_ptr); - this->RecoverOutputName(x_grad, dx_name); + if (tensor_repeat_times.is_initialized() || + tensor_repeat_times_attr.is_initialized()) { + PADDLE_THROW(platform::errors::Unimplemented( + "We don't support RepeatTimes from tensor or repeat_times_tensor for " + "tile composite grad for now. 
")); + } else { + VLOG(6) << "Runing tile_grad composite func"; + prim::tile_grad( + x, out_grad, paddle::experimental::IntArray(repeat_times), dx_ptr); + this->RecoverOutputName(x_grad, dx_name); + } } }; diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index a42c41c1ba229..5e1e490c1b73a 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1774,22 +1774,7 @@ void tile_grad(const Tensor& x, if (x_grad) { auto repeat_times_data = repeat_times.GetData(); auto out_grad_shape = phi::vectorize(out_grad.dims()); - auto x_shape = phi::vectorize(x.dims()); - - if (repeat_times_data.size() < x_shape.size()) { - int diff = x_shape.size() - repeat_times_data.size(); - repeat_times_data.insert(repeat_times_data.begin(), diff, 1); - } else { - int diff = repeat_times_data.size() - x_shape.size(); - x_shape.insert(x_shape.begin(), diff, 1); - } - for (int i = 0; i < static_cast(out_grad_shape.size()); i++) { - if (out_grad_shape[i] == -1) { - out_grad_shape[i] = x_shape[i] * repeat_times_data[i]; - } - } - auto result = reshape(out_grad, out_grad_shape); - + auto result = out_grad; for (int i = 0; i < static_cast(repeat_times_data.size()); i++) { int size = out_grad_shape[i] / repeat_times_data[i]; std::vector sections(repeat_times_data[i], size); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index ea27eba513051..08b2c830d195d 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -4220,7 +4220,7 @@ void TileInferMeta(const MetaTensor& x, auto repeat_times_data = repeat_times.GetData(); auto x_dims = x.dims(); if (repeat_times_data.size() == 0) { - repeat_times_data = std::vector(x_dims.size(), -1); + repeat_times_data = std::vector(x_dims.size(), 1); } PADDLE_ENFORCE_LE( @@ -4253,10 +4253,10 @@ void TileInferMeta(const MetaTensor& x, auto x_dim_vec = phi::vectorize(x_dims); if (x_dim_vec.size() > repeat_times_data.size()) { auto diff = x_dim_vec.size() - repeat_times_data.size(); - repeat_times_data.insert(repeat_times_data.begin(), diff, -1); + repeat_times_data.insert(repeat_times_data.begin(), diff, 1); } else { auto diff = repeat_times_data.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + x_dim_vec.insert(x_dim_vec.begin(), diff, 1); } for (size_t i = 0; i < repeat_times_data.size(); ++i) { if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { From b1668b0e3126c17c906a0bb3972d89a8124788c4 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 25 Apr 2023 16:59:49 +0800 Subject: [PATCH 061/405] Fix calling for tensor.data. --- paddle/fluid/operators/fused/fused_gate_attention.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 3579caf1bc99d..84e687e4b9ccf 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -1303,7 +1303,7 @@ class FlashAttnWithGating { num_splits, softmax_lse->data(), softmax_d.data(), - bias_d.data(), + nonbatched_bias ? 
bias_d.data() : nullptr, nullptr, &workspace_size, stream, @@ -1316,7 +1316,8 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + LOG(INFO) << WaitWithDebugInfo(dev_ctx_) + << "Get workspace_size=" << workspace_size; phi::DenseTensor workspace = CreateWorkspace(workspace_size); succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( @@ -1345,8 +1346,8 @@ class FlashAttnWithGating { num_splits, softmax_lse->data(), softmax_d.data(), - bias_d.data(), - workspace.data(), + nonbatched_bias ? bias_d.data() : nullptr, + (workspace_size > 0) ? static_cast(workspace.data()) : nullptr, &workspace_size, stream, seed, From 6c15247291fcc2d80f8a1f440797abb9b26aa1c1 Mon Sep 17 00:00:00 2001 From: Zenghui Yuan <56808883+huizyuan@users.noreply.github.com> Date: Tue, 25 Apr 2023 17:26:36 +0800 Subject: [PATCH 062/405] Fix some problems in Paddle english instruction doc files. (#53145) * Fix some problems in Paddle english instruction doc files. * fix some new questions * fix cn doc problems, test=document_fix --- python/paddle/fluid/framework.py | 9 ++++++--- python/paddle/tensor/math.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 63ab3a65bb6b9..cab57eae045b3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -718,7 +718,8 @@ def is_compiled_with_cinn(): """ Whether this whl package can be used to run the model on CINN. - Returns (bool): `True` if CINN is currently available, otherwise `False`. + Returns: + Bool: `True` if CINN is currently available, otherwise `False`. Examples: .. code-block:: python @@ -733,7 +734,8 @@ def is_compiled_with_cuda(): """ Whether this whl package can be used to run the model on GPU. - Returns (bool): `True` if CUDA is currently available, otherwise `False`. + Returns: + Bool: `True` if CUDA is currently available, otherwise `False`. Examples: .. code-block:: python @@ -748,7 +750,8 @@ def is_compiled_with_rocm(): """ Whether this whl package can be used to run the model on AMD or Hygon GPU(ROCm). - Returns (bool): `True` if ROCm is currently available, otherwise `False`. + Returns: + Bool: `True` if ROCm is currently available, otherwise `False`. Examples: .. code-block:: python diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 9f5212abf9147..ece8463f35e63 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3057,7 +3057,7 @@ def __check_input(x, offset, axis1, axis2): def diagonal(x, offset=0, axis1=0, axis2=1, name=None): """ - This OP computes the diagonals of the input tensor x. + Computes the diagonals of the input tensor x. If ``x`` is 2D, returns the diagonal. If ``x`` has larger dimensions, diagonals be taken from the 2D planes specified by axis1 and axis2. From 3b9730314ccbaa6523ec0a7057205df95e2eb6a8 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 25 Apr 2023 21:53:26 +0800 Subject: [PATCH 063/405] Change all LOG(INFO) to VLOG and fix the backward. 
--- .../operators/fused/fused_gate_attention.h | 264 ++++++------------ .../fused/fused_gate_attention_op.cu | 5 - 2 files changed, 93 insertions(+), 176 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 84e687e4b9ccf..cf4fbdd3a1739 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -57,10 +57,14 @@ void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, } inline std::string TensorDebugString(const phi::DenseTensor* t) { - if (t && t->initialized()) { - std::stringstream ss; - ss << "shape=[" << t->dims() << "], ptr=" << t->data(); - return ss.str(); + if (t) { + if (t->initialized()) { + std::stringstream ss; + ss << "shape=[" << t->dims() << "], ptr=" << t->data(); + return ss.str(); + } else { + return "not initialized"; + } } else { return "nullptr"; } @@ -366,34 +370,6 @@ struct GateAttentionGradConfig : public GateAttentionConfig { phi::DenseTensor qk_out_grad; }; -#define DEBUG_HERE printf("[%s, %d]: Run here!\n", __func__, __LINE__); -#define DEBUG_DATA_INT(name, x) \ - do { \ - printf( \ - "[%s, %d]: %s = %d\n", __func__, __LINE__, name, static_cast(x)); \ - } \ - whilie(0); - -#define DEBUG_DATA_FlOAT(name, x) \ - do { \ - printf("[%s, %d]: %s = %f\n", \ - __func__, \ - __LINE__, \ - std::string(name), \ - static_cast(x)); \ - } \ - whilie(0); - -#define DEBUG_DIMS(x) \ - do { \ - printf("[%s, %d]: dims is : [", __func__, __LINE__); \ - for (int i = 0; i < x.size(); ++i) { \ - printf("%d, ", x[i]); \ - } \ - printf(" ]\n"); \ - } \ - whilie(0); - template __global__ void FlashAttRange(int start, int step, int size, T* out1, T* out2) { CUDA_KERNEL_LOOP(index, size) { @@ -402,30 +378,6 @@ __global__ void FlashAttRange(int start, int step, int size, T* out1, T* out2) { } } -static void GetFlashAttnDimsString(const std::string& prefix, - const phi::DDim dim_val) { - // if (VLOG_IS_ON(4)) { - std::ostringstream out_string; - out_string << "FlashAttn - " << prefix << ".dims() is ["; - for (int i = 0; i < dim_val.size(); ++i) { - out_string << dim_val[i] << ", "; - } - out_string << "]\n"; - VLOG(4) << out_string.str(); - std::cout << out_string.str(); - // } -} - -#define DBGPTR(ptr, prefix) \ - do { \ - std::ostringstream out_string; \ - void* data = static_cast(ptr); \ - out_string << "[" << __func__ << ", " << __LINE__ << "]: " << prefix \ - << "`s addr is "; \ - out_string << ptr << std::endl; \ - std::cout << out_string.str(); \ - } while (0); - template class FMHAGateRef { public: @@ -943,9 +895,6 @@ class FlashAttnWithGating { void ComputeForward(const phi::DenseTensor* nonbatched_bias, const phi::DenseTensor* src_mask, - phi::DenseTensor* q_transpose_out, - phi::DenseTensor* k_transpose_out, - phi::DenseTensor* v_transpose_out, phi::DenseTensor* qkv_transpose_out, phi::DenseTensor* softmax_lse, phi::DenseTensor* fmha_out, @@ -955,15 +904,13 @@ class FlashAttnWithGating { qkv_transpose_out->dtype() == DataType::BFLOAT16 ? 
true : false; if (std::is_same::value) { - LOG(INFO) << "T is phi::dtype::float16."; + VLOG(4) << "T is phi::dtype::float16."; } else if (std::is_same::value) { - LOG(INFO) << "T is phi::dtype::bfloat16."; + VLOG(4) << "T is phi::dtype::bfloat16."; } else if (std::is_same::value) { - LOG(INFO) << "T is float."; + VLOG(4) << "T is float."; } - LOG(INFO) << "Use flash attention: merge_qkv=" << merge_qkv_; - PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " @@ -981,10 +928,9 @@ class FlashAttnWithGating { seq_batch_size * static_cast(config->seq_len_r), static_cast(config->num_heads), static_cast(config->head_dim)}); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "1: Reshape qkv_transpose_out: [" - << config->qkv_transpose_out_dims << "] -> [" - << qkv_transpose_out->dims() << "]"; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) << "Reshape qkv_transpose_out: [" + << config->qkv_transpose_out_dims << "] -> [" + << qkv_transpose_out->dims() << "]"; // q_size == k_size int64_t q_size = config->GetQuerySize(); @@ -1014,52 +960,53 @@ class FlashAttnWithGating { dims_[dims_rank - 3], dims_[dims_rank - 2], dims_[dims_rank - 1]}); - GetFlashAttnDimsString(prefix, temp_mask.dims()); + VLOG(6) << prefix << ": " << TensorDebugString(dst_tensor); } }; - auto& qkv_dims = qkv_transpose_out->dims(); - dims_merge_func(src_mask, &temp_mask, "mask_dim"); - dims_merge_func(nonbatched_bias, &temp_bias, "bias_dim"); - GetFlashAttnDimsString("qkv_transpose_out", qkv_dims); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "3: Merge dimensions for mask and bias"; + dims_merge_func(src_mask, &temp_mask, "temp_mask"); + dims_merge_func(nonbatched_bias, &temp_bias, "temp_bias"); // 4. flash_attn parameter setting. + auto& qkv_dims = qkv_transpose_out->dims(); int batch_size_ = seq_batch_size; - int total_q_ = qkv_dims[1]; // q.dims()[0] - int total_k_ = qkv_dims[1]; // q.dims()[0] - int num_heads_ = qkv_dims[2]; // q.dims()[1] - int head_size_ = qkv_dims[3]; // q.dims()[2] - int max_seqlen_q_ = batch_size_; - int max_seqlen_k_ = batch_size_; + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_ = config->seq_len_r; // batch_size_; + int max_seqlen_k_ = config->m_size; // batch_size_; int num_splits = 0; // 0 for an internal heuristic, which is optimal - LOG(INFO) << "[Flash_attn Fwd] batch_size : " << batch_size_; - LOG(INFO) << "[Flash_attn Fwd] total_q : " << total_q_; - LOG(INFO) << "[Flash_attn Fwd] total_k : " << total_k_; - LOG(INFO) << "[Flash_attn Fwd] num_heads : " << num_heads_; - LOG(INFO) << "[Flash_attn Fwd] head_size : " << head_size_; - LOG(INFO) << "[Flash_attn Fwd] max_seqlen_q : " << max_seqlen_q_; - LOG(INFO) << "[Flash_attn Fwd] max_seqlen_k : " << max_seqlen_k_; - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "4: Init flash-attention parameters"; + VLOG(6) << "batch_size : " << batch_size_; + VLOG(6) << "total_q : " << total_q_; + VLOG(6) << "total_k : " << total_k_; + VLOG(6) << "num_heads : " << num_heads_; + VLOG(6) << "head_size : " << head_size_; + VLOG(6) << "max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "max_seqlen_k : " << max_seqlen_k_; // 5. 
construct softmax_lse int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; softmax_lse->Resize({batch_size_, num_heads_, softmax_lse_last_dim}); AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", softmax_lse); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "5: Allocate softmax_lse: shape=[" << softmax_lse->dims() - << "]"; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) << "Allocate softmax_lse: shape=[" + << softmax_lse->dims() << "]"; // 6. construct random seed auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; - - GetFlashAttnDimsString("softmax_lse", softmax_lse->dims()); - GetFlashAttnDimsString("cu_seq_q", cu_seq_q.dims()); - GetFlashAttnDimsString("cu_seq_k", cu_seq_k.dims()); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "6: Construct random seed"; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) << "Construct random seed"; + + VLOG(6) << "cu_seq_q: " << TensorDebugString(&cu_seq_q); + VLOG(6) << "cu_seq_k: " << TensorDebugString(&cu_seq_k); + VLOG(6) << "temp_bias: " << TensorDebugString(&temp_bias); + VLOG(6) << "temp_mask: " << TensorDebugString(&temp_mask); + VLOG(6) << "nonbatched_bias: " << TensorDebugString(nonbatched_bias); + VLOG(6) << "src_mask: " << TensorDebugString(src_mask); + VLOG(6) << "qkv_transpose_out: " << TensorDebugString(qkv_transpose_out); + VLOG(6) << "softmax_lse: " << TensorDebugString(softmax_lse); + VLOG(6) << "fmha_out: " << TensorDebugString(fmha_out); + VLOG(6) << "gate_out: " << TensorDebugString(gate_out); // 7. flas_attn part one, get temp worksapce size. float p_dropout = 0.f; @@ -1070,7 +1017,7 @@ class FlashAttnWithGating { static_cast(q_ptr), static_cast(k_ptr), static_cast(v_ptr), - nullptr, // for calculation workspace size + nullptr, // set out to nullptr to calculate workspace size cu_seq_q.data(), cu_seq_k.data(), total_q_, @@ -1100,17 +1047,11 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "7: Get workspace_size=" << workspace_size; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) + << "Get workspace_size=" << workspace_size; + // 8. Run flash-attention kernel. phi::DenseTensor workspace = CreateWorkspace(workspace_size); - - LOG(INFO) << "qkv_transpose_out: " << TensorDebugString(qkv_transpose_out); - LOG(INFO) << "src_mask: " << TensorDebugString(src_mask); - LOG(INFO) << "fmha_out: " << TensorDebugString(fmha_out); - LOG(INFO) << "gate_out: " << TensorDebugString(gate_out); - - // 8. flas_attn part two, run impl. succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( static_cast(q_ptr), static_cast(k_ptr), @@ -1145,7 +1086,8 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) << "8: Run SUCCESS"; + VLOG(5) << "[ComputeForward]" << WaitWithDebugInfo(dev_ctx_) + << "Run SUCCESS!!!"; if (config->has_gating) { gate_out->Resize(config->gate_out_dims); @@ -1165,11 +1107,11 @@ class FlashAttnWithGating { qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; if (std::is_same::value) { - std::cout << "[Grad]: T is phi::dtype::float16. \n"; + VLOG(4) << "[Grad]: T is phi::dtype::float16."; } else if (std::is_same::value) { - std::cout << "[Grad]: T is phi::dtype::bfloat16. \n"; + VLOG(4) << "[Grad]: T is phi::dtype::bfloat16."; } else if (std::is_same::value) { - std::cout << "[Grad]: T is float. 
\n"; + VLOG(4) << "[Grad]: T is float."; } PADDLE_ENFORCE_NOT_NULL( @@ -1193,7 +1135,7 @@ class FlashAttnWithGating { int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + VLOG(5) << WaitWithDebugInfo(dev_ctx_); // 2. Init with cu_seq_q and cu_seq_k for flash_attn. phi::DenseTensor cu_seq_q, cu_seq_k; @@ -1217,59 +1159,55 @@ class FlashAttnWithGating { dims_[dims_rank - 3], dims_[dims_rank - 2], dims_[dims_rank - 1]}); - GetFlashAttnDimsString(prefix, temp_mask.dims()); + VLOG(6) << prefix << ": " << TensorDebugString(dst_tensor); } }; - dims_merge_func(src_mask, &temp_mask, "[Grad] mask_dim"); - dims_merge_func(nonbatched_bias, &temp_bias, "[Grad] bias_dim"); + dims_merge_func(src_mask, &temp_mask, "temp_mask"); + dims_merge_func(nonbatched_bias, &temp_bias, "temp_bias"); phi::DDim qkv_dims({3, seq_batch_size * static_cast(config->seq_len_r), static_cast(config->num_heads), static_cast(config->head_dim)}); int batch_size_ = seq_batch_size; - int total_q_ = qkv_dims[1]; // q.dims()[0] - int total_k_ = qkv_dims[1]; // q.dims()[0] - int num_heads_ = qkv_dims[2]; // q.dims()[1] - int head_size_ = qkv_dims[3]; // q.dims()[2] - int max_seqlen_q_ = batch_size_; - int max_seqlen_k_ = batch_size_; - VLOG(6) << "[Flash_attn Grad] batch_size : " << batch_size_; - VLOG(6) << "[Flash_attn Grad] total_q : " << total_q_; - VLOG(6) << "[Flash_attn Grad] total_k : " << total_k_; - VLOG(6) << "[Flash_attn Grad] num_heads : " << num_heads_; - VLOG(6) << "[Flash_attn Grad] head_size : " << head_size_; - VLOG(6) << "[Flash_attn Grad] max_seqlen_q : " << max_seqlen_q_; - VLOG(6) << "[Flash_attn Grad] max_seqlen_k : " << max_seqlen_k_; + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] + int max_seqlen_q_ = config->seq_len_r; // batch_size_; + int max_seqlen_k_ = config->m_size; // batch_size_; + VLOG(6) << "batch_size : " << batch_size_; + VLOG(6) << "total_q : " << total_q_; + VLOG(6) << "total_k : " << total_k_; + VLOG(6) << "num_heads : " << num_heads_; + VLOG(6) << "head_size : " << head_size_; + VLOG(6) << "max_seqlen_q : " << max_seqlen_q_; + VLOG(6) << "max_seqlen_k : " << max_seqlen_k_; // 5. construct softmax_lse - int last_q_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; - // softmax_lse->Resize({batch_size_, num_heads_, last_q_dim}); - // AllocWithDebugInfo( - // dev_ctx_, "flash_attn: softmax_lse", softmax_lse); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); - - phi::DenseTensor softmax_d = phi::Empty( - dev_ctx_, {batch_size_, num_heads_, last_q_dim}); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + phi::DenseTensor softmax_d; + softmax_d.Resize(softmax_lse->dims()); + AllocWithDebugInfo(dev_ctx_, "d_softmax_lse", &softmax_d); + VLOG(5) << WaitWithDebugInfo(dev_ctx_); phi::DenseTensor bias_d; if (nonbatched_bias) { bias_d = phi::Empty( dev_ctx_, {batch_size_, num_heads_, max_seqlen_q_, max_seqlen_k_}); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + VLOG(5) << WaitWithDebugInfo(dev_ctx_); // 6. 
construct random seed auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; - LOG(INFO) << "fmha_out: " << TensorDebugString(fmha_out); - LOG(INFO) << "fmha_out_grad: " << TensorDebugString(fmha_out_grad); - LOG(INFO) << "softmax_lse: " << TensorDebugString(softmax_lse); - LOG(INFO) << "softmax_d: " << TensorDebugString(&softmax_d); - LOG(INFO) << "bias_d: " << TensorDebugString(&bias_d); + VLOG(6) << "fmha_out: " << TensorDebugString(fmha_out); + VLOG(6) << "fmha_out_grad: " << TensorDebugString(fmha_out_grad); + VLOG(6) << "softmax_lse: " << TensorDebugString(softmax_lse); + VLOG(6) << "softmax_d: " << TensorDebugString(&softmax_d); + VLOG(6) << "nonbatched_bias: " << TensorDebugString(nonbatched_bias); + VLOG(6) << "bias_d: " << TensorDebugString(&bias_d); // 7. flas_attn part one, get temp worksapce size. uint64_t workspace_size; @@ -1284,7 +1222,7 @@ class FlashAttnWithGating { static_cast(q_grad_ptr), static_cast(k_grad_ptr), static_cast(v_grad_ptr), - static_cast(fmha_out->data()), + nullptr, // set out to nullptr to calculate workspace size static_cast(fmha_out_grad->data()), cu_seq_q.data(), cu_seq_k.data(), @@ -1303,7 +1241,7 @@ class FlashAttnWithGating { num_splits, softmax_lse->data(), softmax_d.data(), - nonbatched_bias ? bias_d.data() : nullptr, + nonbatched_bias_grad ? bias_d.data() : nullptr, nullptr, &workspace_size, stream, @@ -1316,8 +1254,8 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "Get workspace_size=" << workspace_size; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) + << "Get workspace_size=" << workspace_size; phi::DenseTensor workspace = CreateWorkspace(workspace_size); succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( @@ -1359,7 +1297,7 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_); + VLOG(5) << WaitWithDebugInfo(dev_ctx_); if (nonbatched_bias) { // compare block reduce @@ -1407,9 +1345,9 @@ class FlashAttnWithGating { FlashAttRange<<>>( start, step, end, cu_seq_q->data(), cu_seq_k->data()); - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "AllocAndInit cu_seq_q and cu_seq_k: start=" << start - << ", step=" << step << ", end=" << end; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) + << "AllocAndInit cu_seq_q and cu_seq_k: start=" << start + << ", step=" << step << ", end=" << end; } phi::DenseTensor CreateWorkspace(uint64_t workspace_size) { @@ -1417,10 +1355,9 @@ class FlashAttnWithGating { if (workspace_size > 0) { workspace = phi::Empty( dev_ctx_, {int64_t(workspace_size / sizeof(float))}); - DBGPTR(workspace.data(), "workspace"); } - LOG(INFO) << WaitWithDebugInfo(dev_ctx_) - << "Allocate workspace: workspace_size=" << workspace_size; + VLOG(5) << WaitWithDebugInfo(dev_ctx_) + << "Allocate workspace: workspace_size=" << workspace_size; return workspace; } @@ -1440,25 +1377,10 @@ class FlashAttnWithGating { dev_ctx_, qkv_out, perm, qkv_transpose_out); } - void ComputeQKVTransposeBackward(const phi::DenseTensor& q_transpose_out_grad, - const phi::DenseTensor& k_transpose_out_grad, - const phi::DenseTensor& v_transpose_out_grad, - phi::DenseTensor* q_out_grad, - phi::DenseTensor* k_out_grad, - phi::DenseTensor* v_out_grad) { - std::vector perm = {0, 1, 3, 2, 4}; - phi::funcs::TransposeGPUKernelDriver( - dev_ctx_, q_transpose_out_grad, 
perm, q_out_grad); - phi::funcs::TransposeGPUKernelDriver( - dev_ctx_, k_transpose_out_grad, perm, k_out_grad); - phi::funcs::TransposeGPUKernelDriver( - dev_ctx_, v_transpose_out_grad, perm, v_out_grad); - } - void ComputeQKVTransposeBackward( const phi::DenseTensor& qkv_transpose_out_grad, phi::DenseTensor* qkv_out_grad) { - std::vector perm = {1, 2, 4, 0, 3, 5}; + std::vector perm = {1, 2, 3, 0, 4, 5}; phi::funcs::TransposeGPUKernelDriver( dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad); } diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 261e2e377b5f8..03ccc1282fa51 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -432,9 +432,6 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { auto fmha_compute = FlashAttnWithGating(dev_ctx, merge_qkv); fmha_compute.ComputeForward(nonbatched_bias, src_mask, - q_transpose_out, - k_transpose_out, - v_transpose_out, qkv_transpose_out, softmax_lse, fmha_out, @@ -474,8 +471,6 @@ template class FusedGateAttentionGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - LOG(INFO) << "In FusedGateAttentionGradKernel"; - // forward input const auto *query = ctx.Input("Query"); const auto *key = ctx.Input("Key"); From af986bd521f9e35b5cfe6307c41c785638532d74 Mon Sep 17 00:00:00 2001 From: lzydev Date: Tue, 25 Apr 2023 23:21:54 +0800 Subject: [PATCH 064/405] Add singlely compile gpu kernel camke function (#53305) * support register single .cu file * add register GPU kernel function --- cmake/operators.cmake | 52 +++++++ paddle/fluid/operators/CMakeLists.txt | 4 + .../fluid/operators/class_center_sample_op.cc | 131 ------------------ paddle/phi/api/yaml/legacy_ops.yaml | 8 -- paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/ops.yaml | 9 ++ .../phi/ops/compat/class_center_sample_sig.cc | 36 ----- 7 files changed, 71 insertions(+), 175 deletions(-) delete mode 100644 paddle/fluid/operators/class_center_sample_op.cc delete mode 100644 paddle/phi/ops/compat/class_center_sample_sig.cc diff --git a/cmake/operators.cmake b/cmake/operators.cmake index e22a747688b76..826d0e773a8a5 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -51,6 +51,58 @@ function(find_phi_register FILENAME ADD_PATH PATTERN) endif() endfunction() +# Just for those gpu kernels locating at "fluid/operators/", such as 'class_center_sample_op.cu'. +# Add other file modes if need in the future. 
+function(register_cu_kernel TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(register_cu_kernel "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + set(cu_srcs) + set(op_common_deps operator op_registry math_function layer + common_infer_shape_functions) + foreach(cu_src ${register_cu_kernel_SRCS}) + if(${cu_src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${cu_src}) + endif() + endforeach() + list(LENGTH cu_srcs cu_srcs_len) + if(${cu_srcs_len} EQUAL 0) + message( + FATAL_ERROR + "The GPU kernel file of ${TARGET} should contains at least one .cu file" + ) + endif() + if(WITH_GPU) + nv_library( + ${TARGET} + SRCS ${cu_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) + elseif(WITH_ROCM) + hip_library( + ${TARGET} + SRCS ${cu_srcs} + DEPS ${op_library_DEPS} ${op_common_deps}) + endif() + set(OP_LIBRARY + ${TARGET} ${OP_LIBRARY} + CACHE INTERNAL "op libs") + foreach(cu_src ${cu_srcs}) + set(op_name "") + # Add PHI Kernel Registry Message + find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL") + find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") + find_phi_register(${cu_src} ${pybind_file} + "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") + find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") + endif() + endforeach() +endfunction() + function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index cb9686611dfd5..e4add9ae4bd3c 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,6 +102,10 @@ op_library(quantize_linear_op DEPS phi) op_library(save_combine_op DEPS string_array phi) op_library(load_combine_op DEPS string_array) +if (WITH_GPU OR WITH_ROCM) + register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS}) +endif() + if (WITH_GPU OR WITH_ROCM) op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS}) elseif (WITH_XPU_KP) diff --git a/paddle/fluid/operators/class_center_sample_op.cc b/paddle/fluid/operators/class_center_sample_op.cc deleted file mode 100644 index 54f0e981ca078..0000000000000 --- a/paddle/fluid/operators/class_center_sample_op.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ClassCenterSampleOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Label"), - ctx.device_context().GetPlace()); - } -}; - -class ClassCenterSampleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Label", - "(Tensor) The input of ClassCenterSample op. Each value " - "of Label is an integer label."); - AddOutput("RemappedLabel", - "(Tensor) Output tensor with same shape as Label. " - "Each label is remap using sampled class."); - AddOutput("SampledLocalClassCenter", - "(Tensor) The sampled class center for local rank," - "value in [0, num_classes)."); - AddAttr( - "num_classes", - "A positive integer to specify the number of classes at local rank. " - "Note that num_classes of each GPU can be different."); - AddAttr( - "num_samples", - "A positive integer to specify the number of class center to sample."); - AddAttr("ring_id", "(int default 0) nccl communication ring id.") - .SetDefault(0); - AddAttr("nranks", "(int default 1) The total number of GPUs.") - .SetDefault(1); - AddAttr("rank", "(int default 0) The rank id in nranks.") - .SetDefault(0); - AddAttr("fix_seed", - "A flag indicating whether to use a fixed seed to generate " - "random negative class center. NOTE: DO NOT set this flag to" - "true in training. Setting this flag to true is only useful " - "in unittest or for debug") - .SetDefault(false); - AddAttr("seed", - "Random seed used to generate random negative class center. " - "[default 0].") - .SetDefault(0); - AddComment(R"DOC( - Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers. - The process of sampling subset class centers is straightforward: 1) First select the positive class centers; - 2) Randomly sample negative class centers. Specifically, given a Label tensor, shape [batch_size], select all - the positive class centers and randomly sample negative class centers, then remap the input label tensor using - the sampled class centers. Note that if the number of the positive class centers is greater than the input - num_samples, it keeps all the positive class centers and the shape of SampledLocalClassCenter will be - [num_positive_class_centers]. The op supports CPU, single GPU and multi GPU. 
- - For more information, Partial FC: Training 10 Million Identities on a Single Machine - arxiv: https://arxiv.org/abs/2010.05222 - - Examples: - For CPU or only one GPU - Given: - Label: [11, 5 , 1 , 3 , 12, 2 , 15, 19, 18, 19] - num_classes = 20 - num_samples = 6 - Then: - RemappedLabel: [4, 3, 0, 2, 5, 1, 6, 8, 7, 8] - SampledLocalClassCenter: [1 , 2 , 3 , 5 , 11, 12, 15, 18, 19] - - For multi GPU - Given: - rank0: - Label: [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ] - num_classes = 10 - num_samples = 6 - ring_id = 0 - nranks = 2 - rank = 0 - rank1: - Label: [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ] - num_classes = 10 - num_samples = 6 - ring_id = 0 - nranks = 2 - rank = 1 - Then: - rank0: - RemappedLabel: [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ] - SampledLocalClassCenter: [0, 2, 4, 8, 9, 3] - rank1: - RemappedLabel: [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ] - SampledLocalClassCenter: [0, 1, 2, 3, 5, 7, 8] -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(class_center_sample, - ClassCenterSampleInferShapeFunctor, - PD_INFER_META(phi::ClassCenterSampleInferMeta)); -REGISTER_OP_WITHOUT_GRADIENT(class_center_sample, - ops::ClassCenterSampleOp, - ops::ClassCenterSampleOpMaker, - ClassCenterSampleInferShapeFunctor); diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index d23a20b18fcc4..61a8cb76e5c2e 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -180,14 +180,6 @@ data_type : x inplace : (x -> out), (input_found_infinite -> output_found_infinite) -- op : class_center_sample - args : (Tensor label, int num_classes, int num_samples, int ring_id, int rank, int nranks, bool fix_seed, int seed) - output : Tensor(remapped_label), Tensor(sampled_local_class_center) - infer_meta : - func : ClassCenterSampleInferMeta - kernel : - func : class_center_sample - - op : coalesce_tensor args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {}) output : Tensor[](output){input.size()}, Tensor(fused_output) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index caf3194b899c2..5585d0c2d3b97 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -379,6 +379,12 @@ outputs : out : Out +- op : class_center_sample + inputs : + label : Label + outputs : + {remapped_label : RemappedLabel, sampled_local_class_center : SampledLocalClassCenter} + - op : clip backward : clip_grad, clip_double_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 1541d1890a07a..c228a0506e813 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -377,6 +377,15 @@ func : cholesky_solve backward : cholesky_solve_grad +- op : class_center_sample + args : (Tensor label, int num_classes, int num_samples, int ring_id = 0, int rank = 0, int nranks = 1, bool fix_seed = false, int seed = 0) + output : Tensor(remapped_label), Tensor(sampled_local_class_center) + infer_meta : + func : ClassCenterSampleInferMeta + kernel : + func : class_center_sample + data_type : label + - op : clip 
args : (Tensor x, Scalar(float) min, Scalar(float) max) output : Tensor(out) diff --git a/paddle/phi/ops/compat/class_center_sample_sig.cc b/paddle/phi/ops/compat/class_center_sample_sig.cc deleted file mode 100644 index cfaf2b86436db..0000000000000 --- a/paddle/phi/ops/compat/class_center_sample_sig.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature ClassCenterSampleOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("class_center_sample", - {"Label"}, - {"num_classes", - "num_samples", - "ring_id", - "rank", - "nranks", - "fix_seed", - "seed"}, - {"RemappedLabel", "SampledLocalClassCenter"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(class_center_sample, - phi::ClassCenterSampleOpArgumentMapping); From 37489df57f42f882c8fbc8bbe2f9edbf6ef3298c Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 26 Apr 2023 10:40:07 +0800 Subject: [PATCH 065/405] Register fluid xpu kerenls to phi [part 3] (#53189) * update * update --- .../fluid/operators/affine_channel_op_xpu.cc | 16 ++++++++------ .../fused/resnet_basic_block_op_xpu.cc | 18 ++++++++++----- .../operators/fused/resnet_unit_op_xpu.cc | 22 ++++++++++++------- paddle/fluid/operators/sampling_id_op_xpu.cc | 6 ++--- .../sequence_ops/sequence_conv_op_xpu.cc | 18 +++++++-------- .../sequence_ops/sequence_unpad_op_xpu.cc | 4 ++-- .../uniform_random_inplace_op_xpu.cc | 19 ++++++++++------ 7 files changed, 60 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc index 2649a9190b86f..7a4de54954d8b 100644 --- a/paddle/fluid/operators/affine_channel_op_xpu.cc +++ b/paddle/fluid/operators/affine_channel_op_xpu.cc @@ -25,7 +25,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class AffineChannelXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -85,7 +85,7 @@ class AffineChannelXPUKernel : public framework::OpKernel { } }; -template +template class AffineChannelGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -189,10 +189,12 @@ class AffineChannelGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using XPU = paddle::platform::XPUDeviceContext; - -REGISTER_OP_XPU_KERNEL(affine_channel, ops::AffineChannelXPUKernel); -REGISTER_OP_XPU_KERNEL(affine_channel_grad, - ops::AffineChannelGradXPUKernel); +PD_REGISTER_STRUCT_KERNEL( + affine_channel, XPU, ALL_LAYOUT, ops::AffineChannelXPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL(affine_channel_grad, + XPU, + ALL_LAYOUT, + ops::AffineChannelGradXPUKernel, + float) {} #endif diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index f6b2d30453f42..4d026f4b780a1 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -293,7 +293,7 @@ static inline void xpu_conv2d_grad(xpu::Context* ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); } -template +template class ResNetBasicBlockXPUKernel : public framework::OpKernel { public: using XPUT = typename XPUTypeTrait::Type; @@ -696,7 +696,7 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { } }; -template +template class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { public: using XPUT = typename XPUTypeTrait::Type; @@ -992,8 +992,14 @@ class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(resnet_basic_block, - ops::ResNetBasicBlockXPUKernel); -REGISTER_OP_XPU_KERNEL(resnet_basic_block_grad, - ops::ResNetBasicBlockGradXPUKernel); +PD_REGISTER_STRUCT_KERNEL(resnet_basic_block, + XPU, + ALL_LAYOUT, + ops::ResNetBasicBlockXPUKernel, + float) {} +PD_REGISTER_STRUCT_KERNEL(resnet_basic_block_grad, + XPU, + ALL_LAYOUT, + ops::ResNetBasicBlockGradXPUKernel, + float) {} #endif diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index 1e2741cde5d9e..1e4ed290f43a9 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class ResNetUnitXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -181,7 +181,7 @@ class ResNetUnitXPUKernel : public framework::OpKernel { } }; -template +template class ResNetUnitGradXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -361,9 +361,15 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(resnet_unit, - ops::ResNetUnitXPUKernel, - ops::ResNetUnitXPUKernel); -REGISTER_OP_XPU_KERNEL(resnet_unit_grad, - ops::ResNetUnitGradXPUKernel, - ops::ResNetUnitGradXPUKernel); +PD_REGISTER_STRUCT_KERNEL(resnet_unit, + XPU, + ALL_LAYOUT, + ops::ResNetUnitXPUKernel, + plat::float16, + float) {} +PD_REGISTER_STRUCT_KERNEL(resnet_unit_grad, + XPU, + ALL_LAYOUT, + ops::ResNetUnitGradXPUKernel, + plat::float16, + float) {} diff --git a/paddle/fluid/operators/sampling_id_op_xpu.cc b/paddle/fluid/operators/sampling_id_op_xpu.cc index 0b720c21381ac..9fd0193733e6e 100644 --- a/paddle/fluid/operators/sampling_id_op_xpu.cc +++ b/paddle/fluid/operators/sampling_id_op_xpu.cc @@ -16,8 +16,6 @@ #include "paddle/fluid/platform/device_context.h" namespace ops = paddle::operators; -using XPUCtx = paddle::platform::XPUDeviceContext; -REGISTER_OP_XPU_KERNEL(sampling_id, - paddle::operators::SamplingIdKernel, - paddle::operators::SamplingIdKernel); +PD_REGISTER_STRUCT_KERNEL( + sampling_id, XPU, ALL_LAYOUT, ops::SamplingIdKernel, float, double) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index f7b0b5c3b581a..53fb13180c36a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SequenceConvXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -154,7 +154,7 @@ class SequenceConvXPUKernel : public framework::OpKernel { } }; -template +template class SequenceConvGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -363,12 +363,12 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - sequence_conv, - ops::SequenceConvXPUKernel); - -REGISTER_OP_XPU_KERNEL( - sequence_conv_grad, - ops::SequenceConvGradXPUKernel); +PD_REGISTER_STRUCT_KERNEL( + sequence_conv, XPU, ALL_LAYOUT, ops::SequenceConvXPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL(sequence_conv_grad, + XPU, + ALL_LAYOUT, + ops::SequenceConvGradXPUKernel, + float) {} #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc index cc81ad20cacda..c875cdc37e80b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op_xpu.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(sequence_unpad, - ops::SequenceUnpadOpKernel); +PD_REGISTER_STRUCT_KERNEL( + sequence_unpad, XPU, ALL_LAYOUT, ops::SequenceUnpadOpKernel, float) {} #endif diff --git a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc index bf0360ace0b87..f1afd8ef3e213 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class XPUUniformRandomInplaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -71,7 +71,7 @@ class XPUUniformRandomInplaceKernel : public framework::OpKernel { } }; -template +template class XPUUniformRandomInplaceGradKernel : public framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext &ctx) const override { @@ -95,10 +95,15 @@ class XPUUniformRandomInplaceGradKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_XPU_KERNEL(uniform_random_inplace, - paddle::operators::XPUUniformRandomInplaceKernel); -REGISTER_OP_XPU_KERNEL( - uniform_random_inplace_grad, - paddle::operators::XPUUniformRandomInplaceGradKernel); +PD_REGISTER_STRUCT_KERNEL(uniform_random_inplace, + XPU, + ALL_LAYOUT, + ops::XPUUniformRandomInplaceKernel, + float) {} +PD_REGISTER_STRUCT_KERNEL(uniform_random_inplace_grad, + XPU, + ALL_LAYOUT, + ops::XPUUniformRandomInplaceGradKernel, + float) {} #endif // PADDLE_WITH_XPU From aed9a75b05ee51276d92f23367a5147d9ff607e4 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Wed, 26 Apr 2023 10:47:36 +0800 Subject: [PATCH 066/405] [Fix Patch Error] Fix patch error in cmake (#53321) --- cmake/external/eigen.cmake | 5 +++-- cmake/external/gloo.cmake | 11 +++++++---- cmake/external/gtest.cmake | 6 ++++-- cmake/external/pybind11.cmake | 9 +++++++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index eda9ae8c4a2ac..8ae880773a4f4 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -51,9 +51,10 @@ if(CMAKE_COMPILER_IS_GNUCC) if(GCC_VERSION GREATER_EQUAL "12.0") file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch complex_header) + # See: [Why calling some `git` commands before `patch`?] set(EIGEN_PATCH_COMMAND - patch -Nd ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < - ${complex_header}) + git checkout -- . && git checkout ${EIGEN_TAG} && patch -Nd + ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header}) endif() endif() diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 63212c974e257..ebdcabd770cb0 100755 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -37,8 +37,9 @@ if(WITH_GPU) VERSION_GREATER 12.0) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch native_dst) - set(GLOO_PATCH_COMMAND patch -d ${GLOO_SOURCE_DIR}/gloo/transport/tcp < - ${native_dst}) + set(GLOO_PATCH_COMMAND + git checkout -- . 
&& git checkout ${GLOO_TAG} &&patch -Nd + ${GLOO_SOURCE_DIR}/gloo/transport/tcp < ${native_dst}) endif() endif() @@ -54,9 +55,11 @@ if(CMAKE_COMPILER_IS_GNUCC) native_dst) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch types_header) + # See: [Why calling some `git` commands before `patch`?] set(GLOO_PATCH_COMMAND - patch -Nd ${GLOO_SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && - patch -Nd ${GLOO_SOURCE_DIR}/gloo/ < ${types_header}) + git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd + ${GLOO_SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd + ${GLOO_SOURCE_DIR}/gloo/ < ${types_header}) endif() endif() include_directories(${GLOO_INCLUDE_DIR}) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 315f6b5b752b2..777d37a023f05 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -66,8 +66,10 @@ endif() if(NOT WIN32 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gtest/gtest-death-test.cc.patch native_src) - set(GTEST_PATCH_COMMAND patch -Nd ${GTEST_SOURCE_DIR}/googletest/src < - ${native_src}) + # See: [Why calling some `git` commands before `patch`?] + set(GTEST_PATCH_COMMAND + git checkout -- . && git checkout ${GTEST_TAG} && patch -Nd + ${GTEST_SOURCE_DIR}/googletest/src < ${native_src}) endif() if(WIN32) ExternalProject_Add( diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index db53e3511be44..c2782af255c59 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -25,8 +25,13 @@ set(PYBIND_PATCH_COMMAND "") if(NOT WIN32) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/pybind/cast.h.patch native_dst) - set(PYBIND_PATCH_COMMAND patch -d ${PYBIND_INCLUDE_DIR}/pybind11 < - ${native_dst}) + # Note: [Why calling some `git` commands before `patch`?] + # Paddle's CI uses cache to accelarate the make process. However, error might raise when patch codes in two scenarios: + # 1. Patch to the wrong version: the tag version of CI's cache falls behind PYBIND_TAG, use `git checkout ${PYBIND_TAG}` to solve this. + # 2. Patch twice: the tag version of cache == PYBIND_TAG, but patch has already applied to cache. + set(PYBIND_PATCH_COMMAND + git checkout -- . && git checkout ${PYBIND_TAG} && patch -Nd + ${PYBIND_INCLUDE_DIR}/pybind11 < ${native_dst}) endif() ExternalProject_Add( From 652d100da33a629e49bb46281647ff5298f1cc70 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:02:58 +0800 Subject: [PATCH 067/405] update cmake (#53345) --- tools/dockerfile/build_scripts/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 5e076e7696542..94014a4235713 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -61,9 +61,9 @@ yum -y install bzip2 make git patch unzip bison yasm diffutils \ # /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license # rm cmake-3.8.1-Linux-x86_64.sh -wget -q https://cmake.org/files/v3.16/cmake-3.16.0.tar.gz && tar xzf cmake-3.16.0.tar.gz && \ -cd cmake-3.16.0 && ./bootstrap && \ -make -j8 && make install && cd .. && rm cmake-3.16.0.tar.gz && rm -rf cmake-3.16.0 +wget -q https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && tar xzf cmake-3.18.0.tar.gz && \ +cd cmake-3.18.0 && ./bootstrap && \ +make -j8 && make install && cd .. 
&& rm cmake-3.18.0.tar.gz && rm -rf cmake-3.18.0 # Install newest autoconf build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH From ed040a17dba36ed12328102bef595fe8ae3bbf6d Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 26 Apr 2023 11:03:22 +0800 Subject: [PATCH 068/405] add autogen code support for box_coder op (#53309) --- .../fluid/operators/detection/CMakeLists.txt | 1 - .../fluid/operators/detection/box_coder_op.cc | 144 ------------------ paddle/phi/api/yaml/legacy_ops.yaml | 9 -- paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/ops.yaml | 9 ++ paddle/phi/ops/compat/box_coder_sig.cc | 28 ---- 6 files changed, 15 insertions(+), 182 deletions(-) delete mode 100644 paddle/fluid/operators/detection/box_coder_op.cc delete mode 100644 paddle/phi/ops/compat/box_coder_sig.cc diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 8c5c1a5d8a2be..64f9e03d7e061 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -28,7 +28,6 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() -detection_library(box_coder_op SRCS box_coder_op.cc) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc deleted file mode 100644 index aafe040991ea0..0000000000000 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/ternary.h" - -namespace paddle { -namespace operators { - -class BoxCoderOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "PriorBox", - "(Tensor, default Tensor) " - "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " - "each box is represented as [xmin, ymin, xmax, ymax], " - "[xmin, ymin] is the left top coordinate of the anchor box, " - "if the input is image feature map, they are close to the origin " - "of the coordinate system. [xmax, ymax] is the right bottom " - "coordinate of the anchor box."); - AddInput("PriorBoxVar", - "(Tensor, default Tensor, optional) " - "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " - "of variance. PriorBoxVar will set all elements to 1 by " - "default.") - .AsDispensable(); - AddInput( - "TargetBox", - "(phi::DenseTensor or Tensor) This input can be a 2-D phi::DenseTensor " - "with shape " - "[N, 4] when code_type is 'encode_center_size'. This input also can " - "be a 3-D Tensor with shape [N, M, 4] when code_type is " - "'decode_center_size'. 
[N, 4], each box is represented as " - "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate " - "of the box if the input is image feature map, they are close to " - "the origin of the coordinate system. [xmax, ymax] is the right " - "bottom coordinate of the box. This tensor can contain LoD " - "information to represent a batch of inputs. One instance of this " - "batch can contain different numbers of entities."); - AddAttr("code_type", - "(string, default encode_center_size) " - "the code type used with the target box") - .SetDefault("encode_center_size") - .InEnum({"encode_center_size", "decode_center_size"}); - AddAttr("box_normalized", - "(bool, default true) " - "whether treat the priorbox as a normalized box") - .SetDefault(true); - AddAttr("axis", - "(int, default 0)" - "which axis in PriorBox to broadcast for box decode," - "for example, if axis is 0 and TargetBox has shape" - "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox " - "will broadcast to [N, M, 4] for decoding. It is only valid" - "when code type is decode_center_size") - .SetDefault(0) - .InEnum({0, 1}); - AddAttr>( - "variance", - "(vector, default {})," - "variance of prior box with shape [4]. PriorBoxVar and variance can" - "not be provided at the same time.") - .SetDefault(std::vector{}); - AddOutput("OutputBox", - "(phi::DenseTensor or Tensor) " - "When code_type is 'encode_center_size', the output tensor of " - "box_coder_op with shape [N, M, 4] representing the result of N " - "target boxes encoded with M Prior boxes and variances. When " - "code_type is 'decode_center_size', N represents the batch size " - "and M represents the number of decoded boxes."); - - AddComment(R"DOC( - -Bounding Box Coder. - -Encode/Decode the target bounding box with the priorbox information. - -The Encoding schema described below: - - ox = (tx - px) / pw / pxv - - oy = (ty - py) / ph / pyv - - ow = log(abs(tw / pw)) / pwv - - oh = log(abs(th / ph)) / phv - -The Decoding schema described below: - - ox = (pw * pxv * tx * + px) - tw / 2 - - oy = (ph * pyv * ty * + py) - th / 2 - - ow = exp(pwv * tw) * pw + tw / 2 - - oh = exp(phv * th) * ph + th / 2 - -where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width -and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the -priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, -`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the -encoded/decoded coordinates, width and height. - -During Box Decoding, two modes for broadcast are supported. Say target box has -shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior -box will broadcast to target box along the assigned axis. 
-)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(box_coder, - BoxCoderInferShapeFunctor, - PD_INFER_META(phi::BoxCoderInferMeta)); - -REGISTER_OPERATOR( - box_coder, - ops::BoxCoderOp, - ops::BoxCoderOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - BoxCoderInferShapeFunctor); diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 61a8cb76e5c2e..5bb3e8d0d73e7 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -139,15 +139,6 @@ func: bincount optional: weights -- op : box_coder - args : (Tensor prior_box, Tensor prior_box_var, Tensor target_box, str code_type, bool box_normalized, int axis, float[] variance) - output : Tensor(output_box) - infer_meta : - func : BoxCoderInferMeta - kernel : - func : box_coder - optional : prior_box_var - - op : cast args : (Tensor x, DataType dtype) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 5585d0c2d3b97..8d29a77337320 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -343,6 +343,12 @@ outputs : out : Out +- op : box_coder + inputs : + {prior_box : PriorBox , prior_box_var : PriorBoxVar, target_box: TargetBox} + outputs : + output_box : OutputBox + - op : broadcast_tensors backward : broadcast_tensors_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index c228a0506e813..507fef3309660 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -329,6 +329,15 @@ func : bmm backward : bmm_grad +- op : box_coder + args : (Tensor prior_box, Tensor prior_box_var, Tensor target_box, str code_type = "encode_center_size", bool box_normalized = true, int axis = 0, float[] variance = {}) + output : Tensor(output_box) + infer_meta : + func : BoxCoderInferMeta + kernel : + func : box_coder + optional : prior_box_var + - op : broadcast_tensors args: (Tensor[] input) output: Tensor[]{input.size()} diff --git a/paddle/phi/ops/compat/box_coder_sig.cc b/paddle/phi/ops/compat/box_coder_sig.cc deleted file mode 100644 index 5b674f3dcd253..0000000000000 --- a/paddle/phi/ops/compat/box_coder_sig.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature BoxCoderOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("box_coder", - {"PriorBox", "PriorBoxVar", "TargetBox"}, - {"code_type", "box_normalized", "axis", "variance"}, - {"OutputBox"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(box_coder, phi::BoxCoderOpArgumentMapping); From 3ec12c2bfec19490669a8f475fbe0d6e3162d167 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:04:46 +0800 Subject: [PATCH 069/405] Optimize prompt information (#53291) * Optimize prompt information * add_information * add_information --- paddle/phi/backends/gpu/gpu_resources.cc | 39 +++++++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 06dc74f2d27f0..a447df94cb4dc 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -16,12 +16,12 @@ #include +#include #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" - #ifdef PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cublasLt.h" @@ -78,10 +78,39 @@ void InitGpuProperties(Place place, for (const int32_t& arch : compiled_archs) { compile_arch_str += std::to_string(arch) + " "; } - LOG(WARNING) << "Paddle with runtime capability " << *compute_capability - << " is not compatible with Paddle installation with arch: " - << compile_arch_str - << ". Please check compiled version of Paddle. 
"; + std::map arch_computing_mapping_table = { + {20, "Fermi"}, + {30, "Kepler"}, + {35, "Kapler"}, + {37, "Kepler"}, + {50, "Maxwell"}, + {52, "Maxwell"}, + {60, "Pascal"}, + {61, "Pascal"}, + {70, "Volta"}, + {75, "Turing"}, + {80, "Ampere"}, + {86, "Ampere"}, + {89, "Ampere"}}; + if (arch_computing_mapping_table.count(*compute_capability)) { + LOG(WARNING) + << "The GPU architecture in your current machine is " + << arch_computing_mapping_table[*compute_capability] + << ", which is not compatible with Paddle installation with arch: " + << compile_arch_str + << ", it is recommended to install the corresponding wheel package " + "according to the installation information on the official " + "Paddle " + "website."; + } else { + LOG(WARNING) + << "The GPU compute capability in your current machine is " + << *compute_capability << ", which is not supported by Paddle" + << ", it is recommended to install the corresponding wheel package " + "according to the installation information on the official " + "Paddle " + "website."; + } } } #endif From 1d549400dc4ae3b5da32a6a54e471257df027dd4 Mon Sep 17 00:00:00 2001 From: Lucas <33367939+cqulilujia@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:06:54 +0800 Subject: [PATCH 070/405] [Bug Fixs] fix bugs when using cast in xpu/cross_entropy kernels, *test=kunlun (#53325) --- .../kernels/xpu/cross_entropy_grad_kernel.cc | 65 +++++++++++++------ .../phi/kernels/xpu/cross_entropy_kernel.cc | 32 ++++++--- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc index edb7157a3440a..a88abbb5e4c81 100644 --- a/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc @@ -54,21 +54,32 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, d); PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad"); } else { - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int* labels_int_ptr_l3 = - RAII_GUARD.alloc_l3_or_gm(labels.numel()); - PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - - r = xpu::cast(dev_ctx.x_context(), - labels.data(), - labels_int_ptr_l3, - labels.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + const int* labels_int_ptr = nullptr; + if (labels.dtype() == DataType::INT32) { + labels_int_ptr = labels.data(); + } else if (labels.dtype() == DataType::INT64) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* labels_int_ptr_l3 = + RAII_GUARD.alloc_l3_or_gm(labels.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); + + r = xpu::cast(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + labels_int_ptr = labels_int_ptr_l3; + } else { + // TODO(lilujia): other data types should be handled + errors::Unimplemented( + ("cross_entropy does not support data types other than int32 and " + "int64")); + } r = xpu::hard_softmax_with_cross_entropy_grad( dev_ctx.x_context(), reinterpret_cast(loss_grad.data()), - labels_int_ptr_l3, + labels_int_ptr, reinterpret_cast(softmax.data()), reinterpret_cast(logit_grad->data()), ignore_index, @@ -113,19 +124,31 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, t); PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad"); } else { - int* labels_int_ptr_l3 = - RAII_GUARD.alloc_l3_or_gm(labels.numel()); - PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - - r = xpu::cast(dev_ctx.x_context(), - labels.data(), - 
labels_int_ptr_l3, - labels.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + const int* labels_int_ptr = nullptr; + if (labels.dtype() == DataType::INT32) { + labels_int_ptr = labels.data(); + } else if (labels.dtype() == DataType::INT64) { + int* labels_int_ptr_l3 = + RAII_GUARD.alloc_l3_or_gm(labels.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); + + r = xpu::cast(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + labels_int_ptr = labels_int_ptr_l3; + } else { + // TODO(lilujia): other data types should be handled + errors::Unimplemented( + ("cross_entropy does not support data types other than int32 and " + "int64")); + } + r = xpu::hard_softmax_with_cross_entropy_grad( dev_ctx.x_context(), reinterpret_cast(loss_grad.data()), - labels_int_ptr_l3, + labels_int_ptr, trans_softmax, trans_logit, ignore_index, diff --git a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc index f1b2257427f67..b678fde9a882b 100644 --- a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc @@ -133,20 +133,32 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, axis == rank - 1 ? d : t); PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy"); } else { - DenseTensor labels_int32; - int* labels_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm(labels.numel()); - PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - - r = xpu::cast(dev_ctx.x_context(), - labels.data(), - labels_int_ptr_l3, - labels.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + const int* labels_int_ptr = nullptr; + if (labels.dtype() == DataType::INT32) { + labels_int_ptr = labels.data(); + } else if (labels.dtype() == DataType::INT64) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* labels_int_ptr_l3 = + RAII_GUARD.alloc_l3_or_gm(labels.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); + + r = xpu::cast(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + labels_int_ptr = labels_int_ptr_l3; + } else { + // TODO(lilujia): other data types should be handled + errors::Unimplemented( + ("cross_entropy does not support data types other than int32 and " + "int64")); + } r = xpu::hard_cross_entropy( dev_ctx.x_context(), softmax_data, - labels_int_ptr_l3, + labels_int_ptr, loss_data, nullptr, axis == rank - 1 ? 
n : n * d / t, From 9127cc3c969cd3c94c91cac90a0fb6c164c29d9a Mon Sep 17 00:00:00 2001 From: denglianbin <112610123+denglianbin@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:15:24 +0800 Subject: [PATCH 071/405] =?UTF-8?q?=E3=80=90Hackathon=20No.48=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20meshgrid=20=E7=AE=97=E5=AD=90=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=20float16=20=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E6=94=AF=E6=8C=81=20(#53284)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc | 1 + paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc | 1 + python/paddle/fluid/tests/unittests/test_meshgrid_op.py | 8 ++++++++ python/paddle/tensor/creation.py | 2 +- 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc index 80cf88b3ceb7f..17f74cd3743bd 100644 --- a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc @@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(meshgrid_grad, GPU, ALL_LAYOUT, phi::MeshgridGradKernel, + phi::dtype::float16, float, double, int, diff --git a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc index c863550979444..73120c1391642 100644 --- a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc @@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(meshgrid, GPU, ALL_LAYOUT, phi::MeshgridKernel, + phi::dtype::float16, float, double, int, diff --git a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py index 60af417ebc545..0039d4ee422e8 100644 --- a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py +++ b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py @@ -76,6 +76,14 @@ def get_x_shape(self): return [100, 300] +class TestMeshgridOp2Fp16(TestMeshgridOp): + def get_x_shape(self): + return [100, 300] + + def get_dtype(self): + return np.float16 + + class TestMeshgridOp3(unittest.TestCase): def test_api(self): x = paddle.static.data(shape=[100], dtype='int32', name='x') diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 186eda03e74d8..c57fceeeb8525 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1512,7 +1512,7 @@ def meshgrid(*args, **kwargs): Args: *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), - (N2,),..., (Nk,). Support data types: ``float64``, ``float32``, ``int32``, ``int64``. + (N2,),..., (Nk,). Support data types: ``float64``, ``float16``, ``float32``, ``int32``, ``int64``. **kwargs (optional): Currently, only accept name in **kwargs The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
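As an illustrative aside, not part of any patch in this series: the meshgrid commit above registers a float16 GPU kernel and widens the documented dtype list, so a minimal way to exercise the new dtype would look like the sketch below. It assumes a CUDA build of Paddle, uses only the public paddle.meshgrid API shown in the docstring change, and the tensor sizes are arbitrary (chosen to match the unit test shapes).

import numpy as np
import paddle

paddle.set_device('gpu')  # the float16 kernel is only registered for GPU in the patch above
x = paddle.to_tensor(np.random.rand(100).astype('float16'))
y = paddle.to_tensor(np.random.rand(300).astype('float16'))
gx, gy = paddle.meshgrid(x, y)  # each output has shape [100, 300] and dtype float16
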
From 2a705b74a02cb8072d5a8510276e6a5154ab9ee1 Mon Sep 17 00:00:00 2001 From: denglianbin <112610123+denglianbin@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:15:49 +0800 Subject: [PATCH 072/405] =?UTF-8?q?=E3=80=90Hackathon=20No.48=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20determinant=20=E7=AE=97=E5=AD=90=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=20float16=20=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E6=94=AF=E6=8C=81=20(#53286)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernels/gpu/determinant_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/determinant_kernel.cu | 9 ++++-- .../impl/determinant_grad_kernel_impl.h | 32 +++++++++++++++---- .../kernels/impl/determinant_kernel_impl.h | 12 ++++++- .../tests/unittests/test_determinant_op.py | 19 +++++++++++ python/paddle/tensor/linalg.py | 2 +- 6 files changed, 65 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu index cce12a87fac72..f3187d5fefb51 100644 --- a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu @@ -21,5 +21,6 @@ PD_REGISTER_KERNEL(determinant_grad, GPU, ALL_LAYOUT, phi::DeterminantGradKernel, + phi::dtype::float16, float, double) {} diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu index 2518408387395..58e27e3ce4abd 100644 --- a/paddle/phi/kernels/gpu/determinant_kernel.cu +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -17,5 +17,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" -PD_REGISTER_KERNEL( - determinant, GPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} +PD_REGISTER_KERNEL(determinant, + GPU, + ALL_LAYOUT, + phi::DeterminantKernel, + phi::dtype::float16, + float, + double) {} diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index 3f463e1d9e064..4d58698c64d22 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -15,8 +15,10 @@ #pragma once #include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -26,7 +28,6 @@ #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/transpose_kernel.h" - namespace phi { namespace detail { @@ -113,6 +114,11 @@ void DeterminantGradKernel(const Context& dev_ctx, return; } + using MPType = typename phi::dtype::MPTypeTrait::Type; + auto origin_dt = std::is_same::value + ? DataType::FLOAT16 + : DataType::BFLOAT16; + // The matrix is invertible // let |A| = Determinant(A) // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf @@ -123,16 +129,22 @@ void DeterminantGradKernel(const Context& dev_ctx, DenseTensor inverse_A; // A must be square matrices! 
inverse_A.Resize(x.dims()); - dev_ctx.template Alloc(&inverse_A); + dev_ctx.template Alloc(&inverse_A); - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, x, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + if (!std::is_same::value) { + mat_inv(dev_ctx, + phi::Cast(dev_ctx, x, DataType::FLOAT32), + &inverse_A); + } else { + mat_inv(dev_ctx, x, &inverse_A); + } VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) DenseTensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); + phi::TransposeLast2Dim(dev_ctx, inverse_A); VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -147,7 +159,15 @@ void DeterminantGradKernel(const Context& dev_ctx, VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); + DenseTensor res; + if (!std::is_same::value) { + res = phi::Multiply( + dev_ctx, + unsqueeze2, + phi::Cast(dev_ctx, transpose_inverse_A, origin_dt)); + } else { + res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); + } VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h index 36e47c78c832c..3c437ad659c43 100644 --- a/paddle/phi/kernels/impl/determinant_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -21,6 +21,7 @@ #include #include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" @@ -31,6 +32,13 @@ namespace detail { template class EigenMatrix {}; +template <> +class EigenMatrix { + public: + using MatrixType = + Eigen::Matrix; +}; + template <> class EigenMatrix { public: @@ -74,6 +82,7 @@ struct DeterminantFunctor { std::vector input_vec; std::vector output_vec; phi::TensorToVector(input, dev_ctx, &input_vec); + using MPType = typename phi::dtype::MPTypeTrait::Type; for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * rank * rank; auto end_iter = input_vec.begin() + (i + 1) * rank * rank; @@ -85,7 +94,8 @@ struct DeterminantFunctor { matrix(i, j) = sub_vec[rank * i + j]; } } - output_vec.push_back(matrix.determinant()); + output_vec.push_back( + static_cast(matrix.template cast().determinant())); } phi::TensorFromVector(output_vec, dev_ctx, output); } diff --git a/python/paddle/fluid/tests/unittests/test_determinant_op.py b/python/paddle/fluid/tests/unittests/test_determinant_op.py index ade000cda8712..8e50f0c5552ec 100644 --- a/python/paddle/fluid/tests/unittests/test_determinant_op.py +++ b/python/paddle/fluid/tests/unittests/test_determinant_op.py @@ -50,6 +50,14 @@ def init_data(self): self.target = np.linalg.det(self.case) +class TestDeterminantOpCase1FP16(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.random.rand(10, 10).astype(np.float16) + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case.astype(np.float32)) + + class TestDeterminantOpCase2(TestDeterminantOp): def init_data(self): np.random.seed(0) @@ -59,6 +67,17 @@ def init_data(self): self.target = np.linalg.det(self.case) +class TestDeterminantOpCase2FP16(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + # not invertible matrix + self.case = np.ones([4, 2, 4, 4]).astype(np.float16) + self.inputs = {'Input': self.case} + 
self.target = np.linalg.det(self.case.astype(np.float32)).astype( + np.float16 + ) + + class TestDeterminantAPI(unittest.TestCase): def setUp(self): np.random.seed(0) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 3dcbc7c6ac63b..2235cf93cfb60 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1809,7 +1809,7 @@ def det(x, name=None): if in_dygraph_mode(): return _C_ops.det(x) else: - check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det') + check_dtype(x.dtype, 'Input', ['float16', 'float32', 'float64'], 'det') input_shape = list(x.shape) assert len(input_shape) >= 2, ( From cf6ed7cb004831384421f62bab236f6e4f2505d7 Mon Sep 17 00:00:00 2001 From: denglianbin <112610123+denglianbin@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:16:22 +0800 Subject: [PATCH 073/405] =?UTF-8?q?=20=E3=80=90Hackathon=20No.48=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20kthvalue=20=E7=AE=97=E5=AD=90=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=20float16=20=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E6=94=AF=E6=8C=81=20(#53285)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fluid/tests/unittests/test_kthvalue_op.py | 24 +++++++++++++++---- python/paddle/tensor/search.py | 2 +- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_kthvalue_op.py b/python/paddle/fluid/tests/unittests/test_kthvalue_op.py index 0bf3d8e948097..e2fa225fd8f7e 100644 --- a/python/paddle/fluid/tests/unittests/test_kthvalue_op.py +++ b/python/paddle/fluid/tests/unittests/test_kthvalue_op.py @@ -40,11 +40,14 @@ def init_args(self): self.k = 5 self.axis = -1 + def init_dtype(self): + self.dtype = np.float64 + def setUp(self): self.op_type = "kthvalue" self.python_api = paddle.kthvalue - self.dtype = np.float64 - self.input_data = np.random.random((2, 1, 2, 4, 10)) + self.init_dtype() + self.input_data = np.random.random((2, 1, 2, 4, 10)).astype(self.dtype) self.init_args() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis} @@ -62,17 +65,25 @@ def test_check_grad(self): self.check_grad({'X'}, 'Out') +class TestKthvalueOpFp16(TestKthvalueOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestKthvalueOpWithKeepdim(OpTest): def init_args(self): self.k = 2 self.axis = 1 + def init_dtype(self): + self.dtype = np.float64 + def setUp(self): self.init_args() + self.init_dtype() self.op_type = "kthvalue" self.python_api = paddle.kthvalue - self.dtype = np.float64 - self.input_data = np.random.random((1, 3, 2, 4, 10)) + self.input_data = np.random.random((1, 3, 2, 4, 10)).astype(self.dtype) self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'keepdim': True} output, indices = cal_kthvalue( @@ -89,6 +100,11 @@ def test_check_grad(self): self.check_grad({'X'}, 'Out') +class TestKthvalueOpWithKeepdimFp16(TestKthvalueOpWithKeepdim): + def init_dtype(self): + self.dtype = np.float16 + + class TestKthvalueOpKernels(unittest.TestCase): def setUp(self): self.axises = [2, -1] diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index c9c1d4c35c165..9fc8e39a9ed82 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -1074,7 +1074,7 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None): Find values and indices of the k-th smallest at the axis. Args: - x(Tensor): A N-D Tensor with type float32, float64, int32, int64. 
+ x(Tensor): A N-D Tensor with type float16, float32, float64, int32, int64. k(int): The k for the k-th smallest number to look for along the axis. axis(int, optional): Axis to compute indices along. The effective range is [-R, R), where R is x.ndim. when axis < 0, it works the same way From b305629c992bf98f5d13722d1f6b9a8f7b3de6f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E6=B2=A7=E5=A4=9C?= Date: Wed, 26 Apr 2023 11:42:23 +0800 Subject: [PATCH 074/405] remove *npu.cc (#53342) --- .../operators/detection/box_coder_op_npu.cc | 448 ------------------ .../detection/density_prior_box_op_npu.cc | 396 ---------------- .../detection/iou_similarity_op_npu.cc | 204 -------- .../operators/detection/prior_box_op_npu.cc | 106 ----- 4 files changed, 1154 deletions(-) delete mode 100644 paddle/fluid/operators/detection/box_coder_op_npu.cc delete mode 100644 paddle/fluid/operators/detection/density_prior_box_op_npu.cc delete mode 100644 paddle/fluid/operators/detection/iou_similarity_op_npu.cc delete mode 100644 paddle/fluid/operators/detection/prior_box_op_npu.cc diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc deleted file mode 100644 index 4170088faff18..0000000000000 --- a/paddle/fluid/operators/detection/box_coder_op_npu.cc +++ /dev/null @@ -1,448 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/impl/box_coder.h" - -namespace paddle { -namespace operators { - -template -struct BoxCoderFunction { - public: - explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); - runner.Run(stream); - return y; - } - phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); - runner.Run(stream); - return y; - } - phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { - phi::DenseTensor z; - z.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); - runner.Run(stream); - return z; - } - phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - z.mutable_data(shape, place); - const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); - runner.Run(stream); - return z; - } - void DivWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - DivWithBroadCastVoid(x, y, shape, &z); - return z; - } - void MulWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - MulWithBroadCastVoid(x, y, shape, &z); - return z; - } - void AddWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - AddWithBroadCastVoid(x, y, shape, &z); - return z; - } - phi::DenseTensor Abs(const phi::DenseTensor& x) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Log(const phi::DenseTensor& x) { - phi::DenseTensor t_x_m1 = Adds(x, -1); - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Exp(const phi::DenseTensor& x) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { - auto dim_x = x.dims(); - auto dim_y = 
y.dims(); - PADDLE_ENFORCE_EQ( - dim_x.size(), - 2, - platform::errors::InvalidArgument( - "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_y.size(), - 2, - platform::errors::InvalidArgument( - "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); - PADDLE_ENFORCE_EQ( - dim_x[1], - dim_y[0], - platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " - "got dim_x[1] = %d, dim_y[0] = %d.", - dim_x[1], - dim_y[0])); - phi::DenseTensor z; - z.mutable_data({dim_x[0], dim_y[1]}, place); - const auto& runner = - NpuOpRunner("MatMul", - {x, y}, - {z}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner.Run(stream); - return z; - } - void ConcatVoid(const std::vector& inputs, - const framework::DDim& shape_out, - int axis, - phi::DenseTensor* output) { - output->mutable_data(shape_out, place); - std::vector names; - for (size_t i = 0; i < inputs.size(); i++) { - names.push_back("x" + std::to_string(i)); - } - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*output}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } - phi::DenseTensor Concat(const std::vector& inputs, - const framework::DDim& shape_out, - int axis) { - phi::DenseTensor output; - ConcatVoid(inputs, shape_out, axis, &output); - return output; - } - phi::DenseTensor Slice(const phi::DenseTensor& x, - const std::vector& offsets, - const std::vector& size, - const framework::DDim& shape) { - phi::DenseTensor y; - y.mutable_data(shape, place); - const auto& runner = - NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -void Vector2Tensor(const framework::ExecutionContext& ctx, - const std::vector& vec, - const framework::DDim& ddim, - phi::DenseTensor* tsr) { - framework::TensorFromVector(vec, ctx.device_context(), tsr); - ctx.template device_context().Wait(); - tsr->Resize(ddim); -} - -template -void BoxCoderEnc(const framework::ExecutionContext& ctx, - const phi::DenseTensor* tb, - const phi::DenseTensor* pb, - const phi::DenseTensor* pbv, - const bool norm, - const std::vector& variance, - phi::DenseTensor* out) { - auto M = pb->dims()[0]; - auto N = tb->dims()[0]; - auto shape_0 = phi::make_ddim({4, 2}); - phi::DenseTensor m_diff; - phi::DenseTensor m_aver; - std::vector vec_diff = {static_cast(-1), - static_cast(0), - static_cast(0), - static_cast(-1), - static_cast(1), - static_cast(0), - static_cast(0), - static_cast(1)}; - std::vector vec_aver = {static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5), - static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5)}; - Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); - Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); - - BoxCoderFunction F(ctx); - phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - phi::DenseTensor tb_xy = F.Dot(*tb, m_aver); - phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); - - pb_xy.Resize({1, M, 2}); - pb_wh.Resize({1, M, 2}); - tb_xy.Resize({N, 1, 2}); - tb_wh.Resize({N, 1, 2}); - - auto shape_half = phi::make_ddim({N, M, 2}); - auto shape_full = phi::make_ddim({N, M, 4}); - - phi::DenseTensor out_xy_0 = F.DivWithBroadCast( - F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); - phi::DenseTensor out_wh_0 = - F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); - phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); - - if (pbv) { - F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); - } else { - phi::DenseTensor t_var; - std::vector vec_var(4); - for (auto i = 0; i < 4; i++) { - vec_var[i] = static_cast(variance[i]); - } - Vector2Tensor(ctx, vec_var, phi::make_ddim({1, 1, 4}), &t_var); - F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); - } -} - -template -void BoxCoderDec(const framework::ExecutionContext& ctx, - const phi::DenseTensor* tb, - const phi::DenseTensor* pb, - const phi::DenseTensor* pbv, - const bool norm, - const std::vector& variance, - int axis, - phi::DenseTensor* out) { - auto shape_0 = phi::make_ddim({4, 2}); - phi::DenseTensor m_diff; - phi::DenseTensor m_aver; - std::vector vec_diff = {static_cast(-1), - static_cast(0), - static_cast(0), - static_cast(-1), - static_cast(1), - static_cast(0), - static_cast(0), - static_cast(1)}; - std::vector vec_aver = {static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5), - static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5)}; - Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); - Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); - - BoxCoderFunction F(ctx); - phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2}) - : phi::make_ddim({pb->dims()[0], 1, 2}); - pb_xy.Resize(pb_resize_shape); - pb_wh.Resize(pb_resize_shape); - - auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2}); - std::vector tbox_slice_size = { - static_cast(tb->dims()[0]), static_cast(tb->dims()[1]), 2}; - phi::DenseTensor tbox01 = - F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); - phi::DenseTensor tbox23 = - F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); - - phi::DenseTensor tb_xy; - phi::DenseTensor tb_wh; - if (pbv) { - auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2}); - auto pbvt_resize_shape = axis == 0 ? 
phi::make_ddim({1, pbv->dims()[0], 2}) - : phi::make_ddim({pbv->dims()[0], 1, 2}); - std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; - phi::DenseTensor pbv_t01 = - F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); - phi::DenseTensor pbv_t23 = - F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); - pbv_t01.Resize(pbvt_resize_shape); - pbv_t23.Resize(pbvt_resize_shape); - - F.AddWithBroadCastVoid( - F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid( - F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), - pb_wh, - tbox_slice_shape, - &tb_wh); - } else if (variance.empty()) { - F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); - } else { - phi::DenseTensor t_var01, t_var23; - auto t_var_shape = phi::make_ddim({1, 1, 2}); - std::vector vec_var01 = {static_cast(variance[0]), - static_cast(variance[1])}; - std::vector vec_var23 = {static_cast(variance[2]), - static_cast(variance[3])}; - Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); - Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); - F.AddWithBroadCastVoid( - F.MulWithBroadCast(tbox01, - F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), - tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid( - F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), - pb_wh, - tbox_slice_shape, - &tb_wh); - } - phi::DenseTensor obox01 = - F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); - phi::DenseTensor obox23 = - F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), - (norm ? 0 : -1)); - F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); -} - -template -class BoxCoderNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* prior_box = ctx.Input("PriorBox"); - auto* prior_box_var = ctx.Input("PriorBoxVar"); - auto* target_box = ctx.Input("TargetBox"); - auto* output_box = ctx.Output("OutputBox"); - std::vector variance = ctx.Attr>("variance"); - const int axis = ctx.Attr("axis"); - - if (prior_box_var) { - PADDLE_ENFORCE_EQ(variance.empty(), - true, - platform::errors::InvalidArgument( - "Input 'PriorBoxVar' and attribute 'variance'" - " of BoxCoder operator should not be used at the " - "same time.")); - } - if (!(variance.empty())) { - PADDLE_ENFORCE_EQ(static_cast(variance.size()), - 4, - platform::errors::InvalidArgument( - "Size of attribute 'variance' in BoxCoder operator" - " should be 4. 
But received size is %d", - variance.size())); - } - - if (target_box->lod().size()) { - PADDLE_ENFORCE_EQ(target_box->lod().size(), - 1, - platform::errors::InvalidArgument( - "Input 'TargetBox' of BoxCoder operator only" - " supports LoD with one level.")); - } - - auto code_type = - phi::funcs::GetBoxCodeType(ctx.Attr("code_type")); - bool normalized = ctx.Attr("box_normalized"); - - if (code_type == phi::funcs::BoxCodeType::kEncodeCenterSize) { - BoxCoderEnc(ctx, - target_box, - prior_box, - prior_box_var, - normalized, - variance, - output_box); - } else { - BoxCoderDec(ctx, - target_box, - prior_box, - prior_box_var, - normalized, - variance, - axis, - output_box); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(box_coder, - ops::BoxCoderNPUKernel, - ops::BoxCoderNPUKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc deleted file mode 100644 index c9935e54d82ef..0000000000000 --- a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc +++ /dev/null @@ -1,396 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection/density_prior_box_op.h" - -namespace paddle { -namespace operators { - -using fp16 = paddle::platform::float16; - -template -struct DensityPriorBoxFunction { - public: - explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context().stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - // x should be init first - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->type())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Concat(const std::vector& inputs, - int axis, - phi::DenseTensor* output) { - // output should be init first - std::vector names; - for (size_t i = 0; i < inputs.size(); i++) { - names.push_back("x" + std::to_string(i)); - } - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*output}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } - void Tile(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& multiples) { - // y should be init first - if (x->dims() == y->dims()) { - framework::TensorCopy( - *x, - place, - ctx.template device_context(), - y); - return; - } - const auto& runner = - NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}}); - runner.Run(stream); - } - void FloatVec2Tsr(const std::vector& vec, phi::DenseTensor* tsr_dst) { - // - framework::TensorFromVector(vec, ctx.device_context(), tsr_dst); - ctx.template device_context().Wait(); - } - - private: - platform::Place place; - 
aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -template <> -void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, - phi::DenseTensor* tsr_dst) { - phi::DenseTensor tsr_fp32(phi::DataType::FLOAT32); - tsr_fp32.mutable_data(tsr_dst->dims(), place); - framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); - ctx.template device_context().Wait(); - Cast(&tsr_fp32, tsr_dst); -} - -template -class DensityPriorBoxOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); - - auto variances = ctx.Attr>("variances"); - auto clip = ctx.Attr("clip"); - - auto fixed_sizes = ctx.Attr>("fixed_sizes"); - auto fixed_ratios = ctx.Attr>("fixed_ratios"); - auto densities = ctx.Attr>("densities"); - - float step_w = ctx.Attr("step_w"); - float step_h = ctx.Attr("step_h"); - float offset = ctx.Attr("offset"); - - int image_w = image->dims()[3]; - int image_h = image->dims()[2]; - int layer_w = input->dims()[3]; - int layer_h = input->dims()[2]; - - auto _type = input->dtype(); - auto place = ctx.GetPlace(); - DensityPriorBoxFunction F(ctx); - - phi::DenseTensor h(_type); - h.mutable_data({layer_h}, place); - phi::DenseTensor w(_type); - w.mutable_data({layer_w}, place); - F.Arange(layer_h, &h); - F.Arange(layer_w, &w); - h.Resize({layer_h, 1, 1, 1}); - w.Resize({1, layer_w, 1, 1}); - - step_w = step_w > 0 ? step_w : static_cast(image_w) / layer_w; - step_h = step_h > 0 ? 
step_h : static_cast(image_h) / layer_h; - int step_average = static_cast((step_w + step_h) * 0.5); - - int ratios_size = fixed_ratios.size(); - int num_priors_per_ratio = 0; - for (size_t i = 0; i < densities.size(); ++i) { - num_priors_per_ratio += densities[i] * densities[i]; - } - phi::DenseTensor di(_type); - phi::DenseTensor dj(_type); - phi::DenseTensor shifts(_type); - phi::DenseTensor box_w_ratio(_type); - phi::DenseTensor box_h_ratio(_type); - di.mutable_data({ratios_size * num_priors_per_ratio}, place); - dj.mutable_data({ratios_size * num_priors_per_ratio}, place); - shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); - box_w_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); - box_h_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); - - int64_t start = 0; - std::vector vec_tile = {0, 0, 0}; - for (size_t i = 0; i < densities.size(); ++i) { - // Range = start:start+ratios_size*density_sqr, density = densities[i] - int density_sqr = densities[i] * densities[i]; - // shifts[Range] = [step_average/density]*ratios_size*density_sqr - phi::DenseTensor shifts_part = - shifts.Slice(start, start + ratios_size * density_sqr); - FillNpuTensorWithConstant(&shifts_part, - static_cast(step_average / densities[i])); - - // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size - // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size - phi::DenseTensor di_part = - di.Slice(start, start + ratios_size * density_sqr); - phi::DenseTensor dj_part = - dj.Slice(start, start + ratios_size * density_sqr); - if (densities[i] > 1) { - di_part.Resize({ratios_size, densities[i], densities[i]}); - dj_part.Resize({ratios_size, densities[i], densities[i]}); - phi::DenseTensor range_n(_type); - range_n.mutable_data({densities[i]}, place); - F.Arange(densities[i], &range_n); - range_n.Resize({1, densities[i], 1}); - vec_tile[0] = ratios_size; - vec_tile[1] = 1; - vec_tile[2] = densities[i]; - F.Tile(&range_n, &di_part, vec_tile); - range_n.Resize({1, 1, densities[i]}); - vec_tile[1] = densities[i]; - vec_tile[2] = 1; - F.Tile(&range_n, &dj_part, vec_tile); - } else { - FillNpuTensorWithConstant(&di_part, static_cast(0)); - FillNpuTensorWithConstant(&dj_part, static_cast(0)); - } - - int start_box_ratio = start; - for (float ar : fixed_ratios) { - // Range_mini = start_box_ratio:start_box_ratio+density_sqr - // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr - // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr - phi::DenseTensor box_h_ratio_part = - box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - phi::DenseTensor box_w_ratio_part = - box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - FillNpuTensorWithConstant(&box_w_ratio_part, - static_cast(fixed_sizes[i] * sqrt(ar))); - FillNpuTensorWithConstant(&box_h_ratio_part, - static_cast(fixed_sizes[i] / sqrt(ar))); - start_box_ratio += density_sqr; - } - start = start_box_ratio; - } - di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - - // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts - // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts - phi::DenseTensor c_x(_type); - phi::DenseTensor c_y(_type); - auto dim0 
= - phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1}); - auto dim1 = - phi::make_ddim({layer_h, 1, ratios_size * num_priors_per_ratio, 1}); - c_x.mutable_data(dim0, place); - c_y.mutable_data(dim1, place); - F.Adds(&w, offset, &w); - F.Muls(&w, step_w, &w); - F.Adds(&w, static_cast(-step_average) * static_cast(0.5), &w); - F.Adds(&h, offset, &h); - F.Muls(&h, step_h, &h); - F.Adds(&h, static_cast(-step_average) * static_cast(0.5), &h); - F.Mul(&di, &shifts, &di); - F.Mul(&dj, &shifts, &dj); - F.Muls(&shifts, static_cast(0.5), &shifts); - F.Add(&di, &shifts, &di); - F.Add(&dj, &shifts, &dj); - F.Add(&dj, &w, &c_x); - F.Add(&di, &h, &c_y); - - // box_w_ratio = box_w_ratio / 2 - // box_h_ratio = box_h_ratio / 2 - F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); - F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); - - phi::DenseTensor zero_t(_type); - phi::DenseTensor one_t(_type); - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - - phi::DenseTensor outbox0(_type); - phi::DenseTensor outbox1(_type); - phi::DenseTensor outbox2(_type); - phi::DenseTensor outbox3(_type); - outbox0.mutable_data(dim0, place); - outbox1.mutable_data(dim1, place); - outbox2.mutable_data(dim0, place); - outbox3.mutable_data(dim1, place); - - // outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 ) - // outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 ) - // outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 ) - // outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 ) - F.Sub(&c_x, &box_w_ratio, &outbox0); - F.Sub(&c_y, &box_h_ratio, &outbox1); - F.Add(&c_x, &box_w_ratio, &outbox2); - F.Add(&c_y, &box_h_ratio, &outbox3); - F.Muls(&outbox0, static_cast(1.0 / image_w), &outbox0); - F.Muls(&outbox1, static_cast(1.0 / image_h), &outbox1); - F.Muls(&outbox2, static_cast(1.0 / image_w), &outbox2); - F.Muls(&outbox3, static_cast(1.0 / image_h), &outbox3); - - F.Maximum(&outbox0, &zero_t, &outbox0); - F.Maximum(&outbox1, &zero_t, &outbox1); - F.Minimum(&outbox2, &one_t, &outbox2); - F.Minimum(&outbox3, &one_t, &outbox3); - if (clip) { - // outbox0 = min ( outbox0, 1 ) - // outbox1 = min ( outbox1, 1 ) - // outbox2 = max ( outbox2, 0 ) - // outbox3 = max ( outbox3, 0 ) - F.Minimum(&outbox0, &one_t, &outbox0); - F.Minimum(&outbox1, &one_t, &outbox1); - F.Maximum(&outbox2, &zero_t, &outbox2); - F.Maximum(&outbox3, &zero_t, &outbox3); - } - - auto out_dim = phi::make_ddim( - {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); - boxes->mutable_data(place); - vars->mutable_data(place); - phi::DenseTensor boxes_share(_type); - phi::DenseTensor vars_share(_type); - boxes_share.ShareDataWith(*boxes); - boxes_share.Resize(out_dim); - vars_share.ShareDataWith(*vars); - vars_share.Resize(out_dim); - - phi::DenseTensor box0(_type); - phi::DenseTensor box1(_type); - phi::DenseTensor box2(_type); - phi::DenseTensor box3(_type); - // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} - out_dim[3] = 1; - box0.mutable_data(out_dim, place); - box1.mutable_data(out_dim, place); - box2.mutable_data(out_dim, place); - box3.mutable_data(out_dim, place); - - std::vector vec_exp_out02 = {layer_h, 1, 1, 1}; - std::vector vec_exp_out13 = {1, layer_w, 1, 1}; - F.Tile(&outbox0, &box0, vec_exp_out02); - F.Tile(&outbox1, &box1, vec_exp_out13); - F.Tile(&outbox2, &box2, vec_exp_out02); - F.Tile(&outbox3, &box3, vec_exp_out13); - F.Concat({box0, box1, box2, box3}, 3, &boxes_share); - - std::vector 
multiples = { - layer_h, layer_w, ratios_size * num_priors_per_ratio, 1}; - phi::DenseTensor variances_t(_type); - // variances.size() == 4 - variances_t.mutable_data({4}, place); - F.FloatVec2Tsr(variances, &variances_t); - F.Tile(&variances_t, &vars_share, multiples); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(density_prior_box, - ops::DensityPriorBoxOpNPUKernel, - ops::DensityPriorBoxOpNPUKernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc deleted file mode 100644 index 8395e25d46251..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/iou_similarity_op.h" - -namespace paddle { -namespace operators { - -template -struct IouFunction { - public: - explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - 
-template -class IouSimilarityNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - bool normalized = ctx.Attr("box_normalized"); - auto* out = ctx.Output("Out"); - - auto _type = x->dtype(); - auto place = ctx.GetPlace(); - - IouFunction F(ctx); - - auto N = x->dims()[0]; - auto M = y->dims()[0]; - - out->mutable_data({N, M}, place); - phi::DenseTensor xt(_type); - phi::DenseTensor yt(_type); - xt.mutable_data({4, N}, place); - yt.mutable_data({4, M}, place); - std::vector vec_trans = {1, 0}; - F.Transpose(x, &xt, vec_trans); - F.Transpose(y, &yt, vec_trans); - phi::DenseTensor xmin1 = xt.Slice(0, 1); - phi::DenseTensor ymin1 = xt.Slice(1, 2); - phi::DenseTensor xmax1 = xt.Slice(2, 3); - phi::DenseTensor ymax1 = xt.Slice(3, 4); - phi::DenseTensor xmin2 = yt.Slice(0, 1); - phi::DenseTensor ymin2 = yt.Slice(1, 2); - phi::DenseTensor xmax2 = yt.Slice(2, 3); - phi::DenseTensor ymax2 = yt.Slice(3, 4); - xmin1.Resize({N, 1}); - ymin1.Resize({N, 1}); - xmax1.Resize({N, 1}); - ymax1.Resize({N, 1}); - xmin2.Resize({1, M}); - ymin2.Resize({1, M}); - xmax2.Resize({1, M}); - ymax2.Resize({1, M}); - - phi::DenseTensor w1(_type); - phi::DenseTensor h1(_type); - phi::DenseTensor w2(_type); - phi::DenseTensor h2(_type); - phi::DenseTensor area1(_type); - phi::DenseTensor area2(_type); - w1.mutable_data({N, 1}, place); - h1.mutable_data({N, 1}, place); - w2.mutable_data({1, M}, place); - h2.mutable_data({1, M}, place); - area1.mutable_data({N, 1}, place); - area2.mutable_data({1, M}, place); - F.Sub(&xmax1, &xmin1, &w1); - F.Sub(&ymax1, &ymin1, &h1); - F.Sub(&xmax2, &xmin2, &w2); - F.Sub(&ymax2, &ymin2, &h2); - if (!normalized) { - F.Adds(&w1, 1.0f, &w1); - F.Adds(&h1, 1.0f, &h1); - F.Adds(&w2, 1.0f, &w2); - F.Adds(&h2, 1.0f, &h2); - } - F.Mul(&w1, &h1, &area1); - F.Mul(&w2, &h2, &area2); - - phi::DenseTensor inter_xmax(_type); - phi::DenseTensor inter_ymax(_type); - phi::DenseTensor inter_xmin(_type); - phi::DenseTensor inter_ymin(_type); - inter_xmax.mutable_data({N, M}, place); - inter_ymax.mutable_data({N, M}, place); - inter_xmin.mutable_data({N, M}, place); - inter_ymin.mutable_data({N, M}, place); - F.Minimum(&xmax1, &xmax2, &inter_xmax); - F.Minimum(&ymax1, &ymax2, &inter_ymax); - F.Maximum(&xmin1, &xmin2, &inter_xmin); - F.Maximum(&ymin1, &ymin2, &inter_ymin); - - phi::DenseTensor inter_w(_type); - phi::DenseTensor inter_h(_type); - inter_w.mutable_data({N, M}, place); - inter_h.mutable_data({N, M}, place); - F.Sub(&inter_xmax, &inter_xmin, &inter_w); - F.Sub(&inter_ymax, &inter_ymin, &inter_h); - - if (!normalized) { - F.Adds(&inter_w, 1.0f, &inter_w); - F.Adds(&inter_h, 1.0f, &inter_h); - } - phi::DenseTensor zeros(_type); - zeros.mutable_data({1}, place); - FillNpuTensorWithConstant(&zeros, static_cast(0)); - F.Maximum(&inter_w, &zeros, &inter_w); - F.Maximum(&inter_h, &zeros, &inter_h); - - F.Mul(&inter_w, &inter_h, out); - phi::DenseTensor union_area(_type); - union_area.mutable_data({N, M}, place); - F.Add(&area1, &area2, &union_area); - F.Sub(&union_area, out, &union_area); - F.DivNoNan(out, &union_area, out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(iou_similarity, - ops::IouSimilarityNPUKernel, - ops::IouSimilarityNPUKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op_npu.cc b/paddle/fluid/operators/detection/prior_box_op_npu.cc 
deleted file mode 100644 index 7df68d2bbb1bb..0000000000000 --- a/paddle/fluid/operators/detection/prior_box_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/prior_box_op.h" - -namespace paddle { -namespace operators { - -template -class PriorBoxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* variances = ctx.Output("Variances"); - - PADDLE_ENFORCE_EQ(boxes->dims(), - variances->dims(), - platform::errors::Unimplemented( - "the shape of boxes and variances must be same in " - "the npu kernel of prior_box, but got boxes->dims() " - "= [%s], variances->dims() = [%s]", - boxes->dims(), - variances->dims())); - - auto min_sizes = ctx.Attr>("min_sizes"); - auto max_sizes = ctx.Attr>("max_sizes"); - auto aspect_ratios = ctx.Attr>("aspect_ratios"); - auto variances_attr = ctx.Attr>("variances"); - bool flip = ctx.Attr("flip"); - bool clip = ctx.Attr("clip"); - float step_w = ctx.Attr("step_w"); - float step_h = ctx.Attr("step_h"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor out(input->type()); - auto out_dims = phi::vectorize(boxes->dims()); - out_dims.insert(out_dims.begin(), 2); - out.Resize(phi::make_ddim(out_dims)); - out.mutable_data(place); - - framework::NPUAttributeMap attr_input = {{"min_size", min_sizes}, - {"max_size", max_sizes}, - {"aspect_ratio", aspect_ratios}, - {"step_h", step_h}, - {"step_w", step_w}, - {"flip", flip}, - {"clip", clip}, - {"offset", offset}, - {"variance", variances_attr}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("PriorBox", {*input, *image}, {out}, attr_input); - runner.Run(stream); - - out.Resize(phi::make_ddim({out.numel()})); - phi::DenseTensor out_boxes = out.Slice(0, boxes->numel()); - phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel()); - - out_boxes.Resize(boxes->dims()); - out_variances.Resize(variances->dims()); - - boxes->mutable_data(place); - variances->mutable_data(place); - - framework::TensorCopy( - out_boxes, - place, - ctx.template device_context(), - boxes); - framework::TensorCopy( - out_variances, - place, - ctx.template device_context(), - variances); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - prior_box, - ops::PriorBoxNPUKernel, - ops::PriorBoxNPUKernel); From 23e96bdec0484a34a5b1b325932bac0a9b733360 Mon Sep 17 00:00:00 2001 From: engineer1109 Date: Wed, 26 Apr 2023 11:46:37 +0800 Subject: [PATCH 075/405] [Debug][Werror]error: control reaches end of non-void function [-Werror=return-type](#53326) --- .../fluid/prim/utils/static/composite_grad_desc_maker.h | 9 --------- 1 file 
changed, 9 deletions(-) diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index eb13902be2068..1ef5161dc0714 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -405,15 +405,6 @@ class CompositeGradOpMakerBase { } } return input_grads; - PADDLE_ENFORCE_LE( - var_names.size(), - 1UL, - platform::errors::Unavailable( - "BUG from operator developer:" - " for input argument with a list of variables, " - " drop_empty_grad is not allowed because it makes" - " the correspondence bewteen a variable and its gradient" - " ambiguous.")); } std::vector MultiOutputGrad( From 66fbfba8ed5c4ac96fbf9634461a7c5a10017645 Mon Sep 17 00:00:00 2001 From: zhuyipin Date: Wed, 26 Apr 2023 13:13:56 +0800 Subject: [PATCH 076/405] add leaky relu composite rule (#52909) * add leaky relu composite rule * add public python api * unset default negative slope * fix unittest case --- .../tests/unittests/test_activation_op.py | 57 +++++++++++++++---- .../incubate/autograd/composite_rules.py | 9 +++ 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index dfa95f760ce6a..c273cd4954941 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -1933,6 +1933,8 @@ def get_alpha(self): def setUp(self): self.op_type = "leaky_relu" self.python_api = paddle.nn.functional.leaky_relu + self.public_python_api = paddle.nn.functional.leaky_relu + self.prim_op_type = "comp" self.init_dtype() self.init_shape() alpha = self.get_alpha() @@ -1948,10 +1950,13 @@ def setUp(self): self.attrs = {'alpha': alpha} self.convert_input_output() + def test_check_output(self): + self.check_output(check_prim=True) + def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_prim=True) class TestLeakyReluAlpha1(TestLeakyRelu): @@ -1973,6 +1978,26 @@ class TestLeakyRelu_ZeroDim(TestLeakyRelu): def init_shape(self): self.shape = [] + def setUp(self): + self.op_type = "leaky_relu" + self.prim_op_type = "comp" + self.enable_cinn = False + self.python_api = paddle.nn.functional.leaky_relu + self.public_python_api = paddle.nn.functional.relu + self.init_dtype() + self.init_shape() + alpha = self.get_alpha() + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.05 + out = ref_leaky_relu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'alpha': alpha} + class TestLeakyReluAPI(unittest.TestCase): # test paddle.nn.LeakyReLU, paddle.nn.functional.leaky_relu, @@ -4031,11 +4056,13 @@ def test_check_grad(self): create_test_act_fp16_class(TestSwish) create_test_act_fp16_class(TestHardSwish, check_prim=True) create_test_act_fp16_class(TestMish) -create_test_act_fp16_class(TestLeakyRelu) -create_test_act_fp16_class(TestLeakyReluAlpha1) -create_test_act_fp16_class(TestLeakyReluAlpha2) -create_test_act_fp16_class(TestLeakyReluAlpha3) -create_test_act_fp16_class(TestLeakyRelu_ZeroDim) +create_test_act_fp16_class(TestLeakyRelu, check_prim=True) +create_test_act_fp16_class(TestLeakyReluAlpha1, check_prim=True) +create_test_act_fp16_class(TestLeakyReluAlpha2, check_prim=True) 
+create_test_act_fp16_class(TestLeakyReluAlpha3, check_prim=True)
+create_test_act_fp16_class(
+    TestLeakyRelu_ZeroDim, check_prim=True, enable_cinn=False
+)
 create_test_act_fp16_class(TestRsqrt)


@@ -4142,11 +4169,19 @@ def test_check_grad(self):
 create_test_act_bf16_class(TestSwish)
 create_test_act_bf16_class(TestHardSwish, check_prim=True)
 create_test_act_bf16_class(TestMish)
-create_test_act_bf16_class(TestLeakyRelu)
-create_test_act_bf16_class(TestLeakyReluAlpha1)
-create_test_act_bf16_class(TestLeakyReluAlpha2)
-create_test_act_bf16_class(TestLeakyReluAlpha3)
-create_test_act_bf16_class(TestLeakyRelu_ZeroDim)
+create_test_act_bf16_class(TestLeakyRelu, check_prim=True, enable_cinn=False)
+create_test_act_bf16_class(
+    TestLeakyReluAlpha1, check_prim=True, enable_cinn=False
+)
+create_test_act_bf16_class(
+    TestLeakyReluAlpha2, check_prim=True, enable_cinn=False
+)
+create_test_act_bf16_class(
+    TestLeakyReluAlpha3, check_prim=True, enable_cinn=False
+)
+create_test_act_bf16_class(
+    TestLeakyRelu_ZeroDim, check_prim=True, enable_cinn=False
+)
 create_test_act_bf16_class(TestRsqrt)

 if __name__ == "__main__":
diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py
index 72bc1601bfacc..3a1a3ea7d6751 100644
--- a/python/paddle/incubate/autograd/composite_rules.py
+++ b/python/paddle/incubate/autograd/composite_rules.py
@@ -677,3 +677,12 @@ def group_norm_composite(x, scale, bias, epsilon, groups, data_layout):
     if is_amp:
         out = cast(out, "float16")
     return out, ret_mean_, ret_var_
+
+
+@REGISTER_COMPOSITE('leaky_relu')
+def leaky_relu_composite(x, negative_slope):
+    """define composite rule of op leaky_relu."""
+    if negative_slope < 1.0:
+        return maximum(x, negative_slope * x)
+    else:
+        return minimum(x, negative_slope * x)

From 35f5c245654f9e995e3d60dd6e603f696cbdae79 Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Wed, 26 Apr 2023 13:25:34 +0800
Subject: [PATCH 077/405] Optimize c_embedding op in deterministic mode (#53197)

* optimize embedding deterministic mode
* fix compile error
* change FLAGS_cudnn_deterministic to int64
* fix 700 error
* add ut
* fix ut
* fix ut
* fix win32 ci
* fix flags with PHI_DEFINE_EXPORTED_int64
---
 .../operators/collective/c_embedding_op.cu    |  87 ++++---
 paddle/phi/core/flags.cc                      |  12 +-
 paddle/phi/kernels/funcs/embedding_grad.h     | 167 ++++++++++++++
 .../phi/kernels/gpu/embedding_grad_kernel.cu  | 116 +---------
 .../unittests/test_embedding_deterministic.py | 213 ++++++++++++++++++
 5 files changed, 456 insertions(+), 139 deletions(-)
 create mode 100644 paddle/phi/kernels/funcs/embedding_grad.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_embedding_deterministic.py

diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu
index 8b521580c5cd5..4861b5d26ab0f 100644
--- a/paddle/fluid/operators/collective/c_embedding_op.cu
+++ b/paddle/fluid/operators/collective/c_embedding_op.cu
@@ -18,8 +18,9 @@ limitations under the License.
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/embedding_grad.h" -DECLARE_bool(cudnn_deterministic); +DECLARE_int64(embedding_deterministic); namespace paddle { namespace operators { @@ -154,7 +155,6 @@ class CEmbeddingGradCUDAKernel : public framework::OpKernel { int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t end_idx = start_idx + N; auto limit = K * D; int blocks = NumBlocks(limit); int threads = kNumCUDAThreads; @@ -166,33 +166,64 @@ class CEmbeddingGradCUDAKernel : public framework::OpKernel { t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); const auto &index_type = framework::TransToProtoVarType(ids_t->dtype()); - if (FLAGS_cudnn_deterministic) { - VLOG(2) << "Run grad kernel of embedding with single thread."; - blocks = 1; - } - if (index_type == framework::proto::VarType::INT32) { - CEmbeddingGrad - <<>>(d_table, - d_output, - ids_t->data(), - K, - D, - N, - start_idx, - end_idx, - limit); - } else if (index_type == framework::proto::VarType::INT64) { - CEmbeddingGrad - <<>>(d_table, - d_output, - ids_t->data(), - K, - D, - N, - start_idx, - end_idx, - limit); + if (FLAGS_embedding_deterministic == 1) { + if (index_type == framework::proto::VarType::INT32) { + phi::funcs::LaunchEmbeddingGradDeterministicKernel( + dev_ctx, + ids_t->data(), + d_output, + d_table, + N, + D, + K, + start_idx); + return; + } else if (index_type == framework::proto::VarType::INT64) { + phi::funcs::LaunchEmbeddingGradDeterministicKernel( + dev_ctx, + ids_t->data(), + d_output, + d_table, + N, + D, + K, + start_idx); + return; + } + } else { + if (FLAGS_embedding_deterministic > 1) { + VLOG(2) << "Run grad kernel of embedding with single thread."; + blocks = 1; + } + const int64_t end_idx = start_idx + N; + if (index_type == framework::proto::VarType::INT32) { + CEmbeddingGrad + <<>>(d_table, + d_output, + ids_t->data(), + K, + D, + N, + start_idx, + end_idx, + limit); + return; + } else if (index_type == framework::proto::VarType::INT64) { + CEmbeddingGrad + <<>>(d_table, + d_output, + ids_t->data(), + K, + D, + N, + start_idx, + end_idx, + limit); + return; + } } + PADDLE_THROW(phi::errors::InvalidArgument( + "The data type of Input(Ids) must be int32 or int64.")); } }; diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 058ab7159e5ee..d2428e28b0dec 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -236,17 +236,19 @@ PHI_DEFINE_EXPORTED_bool( * CUDA related FLAG * Name: FLAGS_embedding_deterministic * Since Version: 2.5 - * Value Range: bool, default=false + * Value Range: int64, default=0 * Example: * Note: whether to use deterministic algorithm in embedding op. - * If true, it will use deterministic CUDA kernel in embedding op. + * If it is 1, it will use the optimized deterministic CUDA kernel in + * embedding op. If it is 2, it will use the legacy deterministic + * CUDA kernel in embedding op. */ -PHI_DEFINE_EXPORTED_bool( +PHI_DEFINE_EXPORTED_int64( embedding_deterministic, - false, + 0, "Whether allow using an deterministic algorithm for embedding " "operator. The deterministic algorithm may be slower. 
If " - "true, the algorithm is deterministic."); + "it is larger than 0, the algorithm is deterministic."); /** * CUDNN related FLAG diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h new file mode 100644 index 0000000000000..3ad0f22c8e912 --- /dev/null +++ b/paddle/phi/kernels/funcs/embedding_grad.h @@ -0,0 +1,167 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" + +namespace phi { +namespace funcs { + +template +__global__ void EmbeddingGradDeterministicKernel(T* table, + const T* output, + const IdT* ids, + const int64_t K, + const int64_t D, + const int64_t start_idx, + const int64_t end_idx) { + using MT = typename dtype::MPTypeTrait::Type; + constexpr int64_t kInvalidId = -1; + extern __shared__ char buf[]; + MT* smem = reinterpret_cast(buf); + MT* my_s = smem + WarpSize * threadIdx.y; + IdT* indices_batch = + reinterpret_cast(buf + sizeof(MT) * WarpSize * BlockDimY); + + const int stride = static_cast(D); + + const int feature = threadIdx.x + blockIdx.x * WarpSize; + + // To ensure determinism. If any other warps pulled grad data targeting + // dst_row, we elect the first warp in each matching group as the leader. + // Each leader warp serializes the accumulates targeting dst_row in shared + // memory, then adding the accumulated buffer to dst_row in table. + for (int batch_start = 0; batch_start < K; + batch_start += WarpSize * BlockDimY) { + int tid = threadIdx.x + threadIdx.y * WarpSize; + if (batch_start + tid < K) { + int64_t cur_id = static_cast(ids[batch_start + tid]); + if (UseLimit) { + if (cur_id >= start_idx && cur_id < end_idx) { + cur_id -= start_idx; + } else { + cur_id = kInvalidId; + } + } + indices_batch[tid] = cur_id; + } + + int batch_end = + min(static_cast(batch_start + WarpSize * BlockDimY), K); + + // Loop over the batch of <= 1024 loaded indices in chunks of BLOCKDIMY + for (int chunk_start = batch_start; chunk_start < batch_end; + chunk_start += BlockDimY) { + // This sync makes sure that indices_batch is ready and match-group + // leaders are done with their accumulates before other warps start + // loading again. 
+ __syncthreads(); + + int n_this_chunk = min(batch_end - chunk_start, BlockDimY); + + int64_t src_row = static_cast(chunk_start + threadIdx.y); + int64_t dst_row = indices_batch[src_row - batch_start]; + if (src_row < K && feature < stride) { + if (UseLimit && dst_row == kInvalidId) { + my_s[threadIdx.x] = static_cast(0); + } else { + my_s[threadIdx.x] = static_cast(output[src_row * D + feature]); + } + } + + __syncthreads(); + + if (src_row < K) { + int match_found_this_thread = 0; + if (threadIdx.x < n_this_chunk && + (!UseLimit || dst_row != kInvalidId)) { + match_found_this_thread = + (dst_row == + indices_batch[chunk_start - batch_start + threadIdx.x]); + } +#ifdef PADDLE_WITH_HIP + unsigned long long int matchmask = // NOLINT + __ballot(match_found_this_thread); // NOLINT + int first_remaining_peer = __ffsll(matchmask) - 1; +#else + // If and only if match_found_this_thread of the Nth thread is non-zero, + // set the Nth bit of matchmask to 1. + unsigned int matchmask = + __ballot_sync(0xffffffff, match_found_this_thread); + // Find the position of the first bit set to 1 in matchmask. + int first_remaining_peer = __ffs(matchmask) - 1; +#endif + + // select lowest-indexed warp as the leader + if (threadIdx.y == first_remaining_peer) { + // Set the first bit 1 in matchmask to 0. + matchmask ^= (1 << first_remaining_peer); + while (matchmask) { +#ifdef PADDLE_WITH_HIP + first_remaining_peer = __ffsll(matchmask) - 1; +#else + first_remaining_peer = __ffs(matchmask) - 1; +#endif + my_s[threadIdx.x] += + smem[threadIdx.x + WarpSize * first_remaining_peer]; + matchmask ^= (1 << first_remaining_peer); + } + if (feature < stride && (!UseLimit || dst_row != kInvalidId)) { + auto table_idx = dst_row * D + feature; + table[table_idx] = static_cast( + static_cast(table[table_idx]) + my_s[threadIdx.x]); + } + } + } + } + } +} + +template +void LaunchEmbeddingGradDeterministicKernel(const GPUContext& ctx, + const IdT* ids, + const T* d_out, + T* d_table, + int64_t N, + int64_t D, + int64_t K, + int64_t start_idx = -1) { +#ifdef PADDLE_WITH_HIP + constexpr int kWarpSize = 64; + constexpr int kBlockDimY = 16; +#else + constexpr int kWarpSize = 32; + constexpr int kBlockDimY = 32; +#endif + dim3 threads(kWarpSize, kBlockDimY); + dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); + using MT = typename dtype::MPTypeTrait::Type; + constexpr auto kSharedMemSize = sizeof(MT) * kWarpSize * kBlockDimY + + sizeof(IdT) * kWarpSize * kBlockDimY; + if (start_idx < 0) { + EmbeddingGradDeterministicKernel + <<>>( + d_table, d_out, ids, K, D, -1, -1); + } else { + int64_t end_idx = start_idx + N; + EmbeddingGradDeterministicKernel + <<>>( + d_table, d_out, ids, K, D, start_idx, end_idx); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 4771dd15dd296..99ba12b1d6213 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/embedding_grad_kernel.h" +#include "paddle/phi/kernels/funcs/embedding_grad.h" #include "gflags/gflags.h" #include "glog/logging.h" @@ -26,20 +27,10 @@ #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/embedding_util.h" -DECLARE_bool(embedding_deterministic); +DECLARE_int64(embedding_deterministic); namespace phi { -#ifdef PADDLE_WITH_HIP -#define WARP_SIZE 64 -#define BLOCKDIMY 16 -#else -#define WARP_SIZE 32 -#define BLOCKDIMY 32 -#endif - -#define MASK 0xffffffff - template __global__ void InputTypeConvert(const InT* in_ids, const int64_t K, @@ -74,91 +65,6 @@ __global__ void EmbeddingGrad(T* table, } } -template -__global__ void EmbeddingGradDeterministic( - T* table, const T* output, const IdT* ids, const IdT K, const IdT D) { - using MT = typename dtype::MPTypeTrait::Type; - extern __shared__ char buf[]; - MT* smem = reinterpret_cast(buf); - MT* my_s = smem + WARP_SIZE * threadIdx.y; - IdT* indices_batch = - reinterpret_cast(buf + sizeof(MT) * WARP_SIZE * BLOCKDIMY); - - const int stride = static_cast(D); - - const int feature = threadIdx.x + blockIdx.x * WARP_SIZE; - - // To ensure determinism. If any other warps pulled grad data targeting - // dst_row, we elect the first warp in each matching group as the leader. - // Each leader warp serializes the accumulates targeting dst_row in shared - // memory, then adding the accumulated buffer to dst_row in table. - for (int batch_start = 0; batch_start < K; - batch_start += WARP_SIZE * BLOCKDIMY) { - int tid = threadIdx.x + threadIdx.y * WARP_SIZE; - if (batch_start + tid < K) - indices_batch[tid] = static_cast(ids[batch_start + tid]); - - int batch_end = - min(static_cast(batch_start + WARP_SIZE * BLOCKDIMY), K); - - // Loop over the batch of <= 1024 loaded indices in chunks of BLOCKDIMY - for (int chunk_start = batch_start; chunk_start < batch_end; - chunk_start += BLOCKDIMY) { - // This sync makes sure that indices_batch is ready and match-group - // leaders are done with their accumulates before other warps start - // loading again. - __syncthreads(); - - int n_this_chunk = min(batch_end - chunk_start, BLOCKDIMY); - - IdT src_row = static_cast(chunk_start + threadIdx.y); - IdT dst_row = indices_batch[src_row - batch_start]; - if (src_row < K && feature < stride) - my_s[threadIdx.x] = static_cast(output[src_row * D + feature]); - - __syncthreads(); - - if (src_row < K) { - int match_found_this_thread = 0; - if (threadIdx.x < n_this_chunk) { - match_found_this_thread = - (dst_row == - indices_batch[chunk_start - batch_start + threadIdx.x]); - } -#ifdef PADDLE_WITH_HIP - unsigned long long int matchmask = // NOLINT - __ballot(match_found_this_thread); // NOLINT - int first_remaining_peer = __ffsll(matchmask) - 1; -#else - // If and only if match_found_this_thread of the Nth thread is non-zero, - // set the Nth bit of matchmask to 1. - unsigned int matchmask = __ballot_sync(MASK, match_found_this_thread); - // Find the position of the first bit set to 1 in matchmask. - int first_remaining_peer = __ffs(matchmask) - 1; -#endif - - // select lowest-indexed warp as the leader - if (threadIdx.y == first_remaining_peer) { - // Set the first bit 1 in matchmask to 0. 
- matchmask ^= (1 << first_remaining_peer); - while (matchmask) { -#ifdef PADDLE_WITH_HIP - first_remaining_peer = __ffsll(matchmask) - 1; -#else - first_remaining_peer = __ffs(matchmask) - 1; -#endif - my_s[threadIdx.x] += - smem[threadIdx.x + WARP_SIZE * first_remaining_peer]; - matchmask ^= (1 << first_remaining_peer); - } - if (feature < stride) - table[dst_row * D + feature] += static_cast(my_s[threadIdx.x]); - } - } - } - } -} - template struct EmbeddingGradCUDAFunctor { EmbeddingGradCUDAFunctor(const Context& dev_ctx, @@ -198,20 +104,18 @@ struct EmbeddingGradCUDAFunctor { cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream())); #endif - if (FLAGS_embedding_deterministic) { - dim3 threads(WARP_SIZE, BLOCKDIMY); - dim3 grids(static_cast((D + WARP_SIZE - 1) / WARP_SIZE)); - using MT = typename dtype::MPTypeTrait::Type; - EmbeddingGradDeterministic - <<>>(d_table, d_output, ids, K, D); + if (FLAGS_embedding_deterministic == 1) { + phi::funcs::LaunchEmbeddingGradDeterministicKernel( + dev_ctx_, ids, d_output, d_table, N, D, K); } else { const int gridx = 2 * dev_ctx_.GetSMCount(); dim3 threads(128, 8); dim3 grids(gridx, 1); + if (FLAGS_embedding_deterministic > 1) { + VLOG(2) << "Run grad kernel of embedding with single thread."; + grids.x = 1; + threads.y = 1; + } EmbeddingGrad<<>>( d_table, d_output, ids, N, K, D); } diff --git a/python/paddle/fluid/tests/unittests/test_embedding_deterministic.py b/python/paddle/fluid/tests/unittests/test_embedding_deterministic.py new file mode 100644 index 0000000000000..e64b4aa07ef9c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_embedding_deterministic.py @@ -0,0 +1,213 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import contextlib +import random +import sys +import unittest + +import numpy as np + +import paddle +from paddle.distributed.fleet.layers.mpu.mp_ops import _c_lookup_table + + +@contextlib.contextmanager +def deterministic_guard(value): + flag_name = 'FLAGS_embedding_deterministic' + old_value = paddle.get_flags(flag_name)[flag_name] + paddle.set_flags({flag_name: value}) + assert paddle.get_flags(flag_name)[flag_name] == value + yield + paddle.set_flags({flag_name: old_value}) + assert paddle.get_flags(flag_name)[flag_name] == old_value + + +def to_numpy(tensor): + if tensor.dtype in [paddle.float16, paddle.bfloat16]: + tensor = tensor.astype(paddle.float32) + return tensor.numpy() + + +def clone_weight(weight): + if weight.dtype == paddle.bfloat16: + weight = weight.astype(paddle.float32).numpy() + weight = paddle.to_tensor(weight, dtype=paddle.float32).astype( + paddle.bfloat16 + ) + else: + weight = paddle.to_tensor(weight.numpy()) + weight.stop_gradient = False + return weight + + +def embedding(ids, weight, out_grad, deterministic_level=0, rank=None): + weight = clone_weight(weight) + with deterministic_guard(deterministic_level): + if rank is not None: + vocab_size, _ = weight.shape + start_idx = vocab_size * rank + out = _c_lookup_table(weight, ids, start_index=start_idx) + else: + out = paddle.nn.functional.embedding(ids, weight) + out.backward(out_grad.clone()) + return to_numpy(out), to_numpy(weight.grad) + + +def embedding_ground_truth(ids, weight, out_grad, rank=None): + weight = clone_weight(weight.astype(paddle.float32)) + out_grad = out_grad.astype(paddle.float32) + return embedding(ids, weight, out_grad, deterministic_level=2, rank=rank) + + +def generate_input_data( + ids_shape, + vocab_size, + hidden_size, + weight_dtype, + ids_dtype, + allow_duplicate_id=True, + rank=None, + nranks=None, + allow_pure_random=False, +): + max_id = vocab_size if rank is None else vocab_size * nranks + if allow_duplicate_id: + ids = np.random.randint(low=0, high=max_id, size=ids_shape) + else: + sequence = list(range(max_id)) + numel = int(np.prod(ids_shape)) + if len(sequence) < numel: + return None, None, None + ids = np.array(random.sample(sequence, numel)).reshape(ids_shape) + + ids = paddle.to_tensor(ids).astype(ids_dtype) + ids.stop_gradient = True + + weight = paddle.randn([vocab_size, hidden_size]).astype(weight_dtype) + weight.stop_gradient = False + + out_grad_shape = list(ids_shape) + [hidden_size] + if allow_duplicate_id and not allow_pure_random: + out_grad = paddle.randint(low=-10, high=10, shape=out_grad_shape) + else: + out_grad = paddle.randn(out_grad_shape) + out_grad = out_grad.astype(weight.dtype) + return ids, weight, out_grad + + +def get_all_dtypes(): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return [] + + dtypes = [paddle.float32, paddle.float16] + if 'A100' in paddle.device.cuda.get_device_properties().name: + dtypes.append(paddle.bfloat16) + return dtypes + + +class TestEmbeddingBase(unittest.TestCase): + def setUp(self): + self.ids_shape = [32, 3] + self.vocab_size = 128 + self.hidden_size = 1024 + self.nranks = 8 + + def check_main( + self, + weight_dtype, + ids_dtype, + deterministic_level=0, + rank=None, + allow_duplicate_id=True, + allow_pure_random=False, + ): + if sys.platform == 'win32' and rank is not None: + return + + ids, weight, out_grad = generate_input_data( + ids_shape=self.ids_shape, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + weight_dtype=weight_dtype, + ids_dtype=ids_dtype, + 
allow_duplicate_id=allow_duplicate_id, + rank=rank, + nranks=self.nranks, + allow_pure_random=allow_pure_random, + ) + if ids is None: + return + + if allow_pure_random: + out_1, weight_grad_1 = embedding_ground_truth( + ids, weight, out_grad, rank + ) + out_2, weight_grad_2 = embedding_ground_truth( + ids, weight, out_grad, rank + ) + else: + out_1, weight_grad_1 = embedding_ground_truth( + ids, weight, out_grad, rank + ) + out_2, weight_grad_2 = embedding( + ids, + weight, + out_grad, + deterministic_level=deterministic_level, + rank=rank, + ) + np.testing.assert_equal(out_1, out_2) + np.testing.assert_equal(weight_grad_1, weight_grad_2) + + def test_main(self): + weight_dtypes = get_all_dtypes() + ids_dtypes = [paddle.int64, paddle.int32] + deterministic_levels = [0, 1] + ranks = [None, 0, 2, 4, 8] + allow_duplicate_ids = [False, True] + allow_pure_randoms = [False, True] + for weight_dtype in weight_dtypes: + for ids_dtype in ids_dtypes: + for deterministic_level in deterministic_levels: + for rank in ranks: + for allow_duplicate_id in allow_duplicate_ids: + for allow_pure_random in allow_pure_randoms: + self.check_main( + weight_dtype, + ids_dtype, + deterministic_level, + rank, + allow_duplicate_id, + allow_pure_random, + ) + + +class TestEmbedding2(TestEmbeddingBase): + def setUp(self): + self.ids_shape = [32, 16] + self.vocab_size = 128 + self.hidden_size = 1024 + self.nranks = 8 + + +class TestEmbeddingDeterministic(unittest.TestCase): + def setUp(self): + self.ids_shape = [32, 16] + self.vocab_size = 128 + self.hidden_size = 1024 + + +if __name__ == "__main__": + unittest.main() From c59debe22926fb97d99ab3007e1b5f092e40f6c3 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 26 Apr 2023 14:35:18 +0800 Subject: [PATCH 078/405] [HybridParallel]Add segment methods for pipelineparallel (#53344) --- .../parallel_layers/pp_layers.py | 34 ++++++++++++++++++- .../unittests/hybrid_parallel_pp_layer.py | 14 ++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index 2f5c42a69e362..f3be9894a9cfe 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -109,7 +109,37 @@ def __init__( ), "layer number should be greater than number of segments" def do_segment(self): - if self.method == "uniform": + + if isinstance(self.method, list): + seg_method = self.method[:] + source_num_parts = len(seg_method) - 1 + + def check_sanity(): + assert seg_method[0] == 0, "seg_method[0] should be 0" + for part in seg_method: + assert isinstance(part, int), "part should be int" + assert part >= 0, f"part[{part}] should be greater than 0" + assert ( + part <= self.num_items + ), "part[{}] should be less than num_items[{}]".format( + part, self.num_items + ) + + check_sanity() + + if self.num_parts == source_num_parts + 1: + seg_method.append(self.num_items) + return seg_method + elif self.num_parts == source_num_parts: + return seg_method + else: + raise ValueError( + "We set seg_method as {}, this length is {}, but the number of stages is {}".format( + seg_method, len(seg_method), self.num_parts + ) + ) + + elif self.method == "uniform": return self.uniform(self.num_items, self.num_parts) elif self.method.startswith('layer:'): @@ -144,6 +174,8 @@ def do_segment(self): memory_counter = 0 result[actual_num_parts] = 
len(weights) return result + else: + raise ValueError(f"method {self.method} is not supported") def _gen_layer_weight(self, layername): weight_idxs = [] diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py index 5d15e79d64b75..cf4c20e550ab3 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py @@ -136,6 +136,20 @@ def test_pipelayer_sequential(self): np.testing.assert_array_equal(param_a.name, param_b.name) np.testing.assert_allclose(param_a.numpy(), param_b.numpy()) + def test_pipelayer_segment_method(self): + init_net = AlexNetPipe() + pipe_model = PipelineLayer( + layers=init_net.to_layers(), + num_stages=self.pipeline_parallel_size, + seg_method=[0, 4], + loss_fn=nn.CrossEntropyLoss(), + ) + stage_id = self.hcg.get_stage_id() + if stage_id == 0: + np.testing.assert_array_equal(len(pipe_model.parameters()), 4) + elif stage_id == 1: + np.testing.assert_array_equal(len(pipe_model.parameters()), 8) + if __name__ == '__main__': unittest.main() From cd88156a369bbfb83d6306f89e0ae6ebd78b8040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AA=91=E9=A9=AC=E5=B0=8F=E7=8C=AB?= <1435130236@qq.com> Date: Wed, 26 Apr 2023 14:47:29 +0800 Subject: [PATCH 079/405] [Bug fixes] enable two ops to support bf16 in llama model (#53026) --- python/paddle/nn/functional/activation.py | 2 +- python/paddle/tensor/manipulation.py | 6 ++++-- python/paddle/tensor/math.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index d89ce9cb4537a..04fa9ebc6dd09 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -965,7 +965,7 @@ def silu(x, name=None): Where :math:`x` is the input Tensor. Parameters: - x (Tensor): The input Tensor with data type float32, float64. + x (Tensor): The input Tensor with data type bfloat16, float16, float32, float64. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4d053e7c90c6d..27e2a4b812de5 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1899,7 +1899,7 @@ def split(x, num_or_sections, axis=0, name=None): Split the input tensor into multiple sub-Tensors. Args: - x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, uint8, int8, int32 or int64. + x (Tensor): A N-D Tensor. The data type is bool, bfloat16, float16, float32, float64, uint8, int8, int32 or int64. num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``x`` will be divided into. If ``num_or_sections`` is a list or tuple, the length of it indicates the number of @@ -1970,6 +1970,7 @@ def split(x, num_or_sections, axis=0, name=None): 'input', [ 'bool', + 'bfloat16', 'float16', 'uint16', 'float32', @@ -2546,7 +2547,7 @@ def unsqueeze(x, axis, name=None): please use `Tensor.clone` like ``unsqueeze_clone_x = x.unsqueeze(-1).clone()``. Args: - x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64. + x (Tensor): The input Tensor to be unsqueezed. 
Supported data type: bfloat16, float16, float32, float64, bool, int8, int32, int64. axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axis`` is a Tensor, it should be an 1-D Tensor . @@ -2600,6 +2601,7 @@ def unsqueeze(x, axis, name=None): input, 'input', [ + 'uint16', 'float16', 'uint16', 'float32', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ece8463f35e63..2f94f0a7e2013 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -499,6 +499,7 @@ def _elementwise_op(helper): "elementwise_sub", "elementwise_mul", "elementwise_div", + "elementwise_max", ] if original_op_type in bf16_and_complex_supported_ops: data_type = [ From f9e5072b2ba1a36f4fde23763b37d0fbf599fb1d Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Wed, 26 Apr 2023 15:59:40 +0800 Subject: [PATCH 080/405] remove some [-Wunused-parameter] waring (#53319) * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop --- paddle/fluid/distributed/ps/service/server.h | 11 +++--- .../fluid/distributed/ps/table/common_table.h | 14 ++++---- paddle/fluid/distributed/ps/table/table.h | 33 ++++++++--------- paddle/fluid/framework/data_feed.h | 23 ++++++------ paddle/fluid/framework/data_set.h | 36 ++++++++++--------- paddle/fluid/framework/device_worker.h | 19 +++++----- .../framework/no_need_buffer_vars_inference.h | 27 +++++++------- paddle/fluid/framework/op_registry.h | 2 +- paddle/phi/core/kernel_registry.h | 2 +- paddle/phi/kernels/empty_kernel.cc | 6 ++-- paddle/phi/kernels/funcs/compound_functors.h | 18 +++++----- .../phi/kernels/funcs/elementwise_functor.h | 25 ++++++------- paddle/phi/kernels/funcs/for_range.h | 5 +-- paddle/phi/kernels/funcs/reduce_functor.h | 10 +++--- .../kernels/impl/meshgrid_grad_kernel_impl.h | 4 +-- paddle/phi/kernels/impl/reduce_grad.h | 6 ++-- .../kernels/impl/set_value_grad_kernel_impl.h | 2 +- paddle/phi/kernels/onednn/conv_handler.h | 4 +-- 18 files changed, 131 insertions(+), 116 deletions(-) diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index 7aab3abe71f91..48b32d22cac79 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/macros.h" namespace google { namespace protobuf { @@ -96,7 +97,9 @@ class PSServer { virtual int32_t StartS2S() { return 0; } virtual ::std::future SendPServer2PServerMsg( - int msg_type, int to_pserver_id, const std::string &msg) { + int msg_type UNUSED, + int to_pserver_id UNUSED, + const std::string &msg UNUSED) { LOG(FATAL) << "NotImplementError: PSServer::send_pserver2pserver_msg"; std::promise promise; std::future fut = promise.get_future(); @@ -124,9 +127,9 @@ class PSServer { } return itr->second(msg_type, from_pserver_id, msg); } - virtual int32_t ReceiveFromPServer(int msg_type, - int pserver_id, - const std::string &msg) { + virtual int32_t ReceiveFromPServer(int msg_type UNUSED, + int pserver_id UNUSED, + const std::string &msg UNUSED) { LOG(FATAL) << "NotImplementError::PSServer::ReceiveFromPServer"; return -1; } diff --git 
a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index d1b93af538cde..d56b8cd39a00a 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -70,18 +70,20 @@ class BarrierTable : public Table { BarrierTable() {} virtual ~BarrierTable() {} - virtual void *GetShard(size_t shard_idx) { return 0; } + virtual void *GetShard(size_t shard_idx UNUSED) { return 0; } - virtual int32_t Pull(TableContext &context) { return 0; } // NOLINT - virtual int32_t Push(TableContext &context) { return 0; } // NOLINT + virtual int32_t Pull(TableContext &context UNUSED) { return 0; } // NOLINT + virtual int32_t Push(TableContext &context UNUSED) { return 0; } // NOLINT - int32_t Shrink(const std::string ¶m) override { return 0; } + int32_t Shrink(const std::string ¶m UNUSED) override { return 0; } virtual void Clear() {} virtual int32_t Flush() { return 0; } - virtual int32_t Load(const std::string &path, const std::string ¶m) { + virtual int32_t Load(const std::string &path UNUSED, + const std::string ¶m UNUSED) { return 0; } - virtual int32_t Save(const std::string &path, const std::string ¶m) { + virtual int32_t Save(const std::string &path UNUSED, + const std::string ¶m UNUSED) { return 0; } virtual int32_t InitializeShard() { return 0; } diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index f07a3f2132217..b64e05e3b0a11 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -32,6 +32,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace distributed { @@ -77,22 +78,22 @@ class Table { virtual int32_t Push(TableContext &context) = 0; // NOLINT // only for barrier - virtual int32_t Barrier(const uint32_t trainer_id, - const std::string barrier_type) { + virtual int32_t Barrier(const uint32_t trainer_id UNUSED, + const std::string barrier_type UNUSED) { return 0; } // only for barrier table virtual int32_t SetTableMap( - std::unordered_map> *table_map) { + std::unordered_map> *table_map UNUSED) { return 0; } // only for tensor table virtual int32_t SetProgramEnv( - framework::Scope *scope, - platform::Place place, - const std::vector *sub_program) { + framework::Scope *scope UNUSED, + platform::Place place UNUSED, + const std::vector *sub_program UNUSED) { return 0; } @@ -115,23 +116,23 @@ class Table { const std::string &converter) = 0; // for cache virtual int32_t SaveCache( - const std::string &path, - const std::string ¶m, + const std::string &path UNUSED, + const std::string ¶m UNUSED, paddle::framework::Channel> - &shuffled_channel) { + &shuffled_channel UNUSED) { return 0; } virtual int64_t CacheShuffle( - const std::string &path, - const std::string ¶m, - double cache_threshold, + const std::string &path UNUSED, + const std::string ¶m UNUSED, + double cache_threshold UNUSED, std::function( int msg_type, int to_pserver_id, std::string &msg)> // NOLINT - send_msg_func, + send_msg_func UNUSED, paddle::framework::Channel> - &shuffled_channel, - const std::vector &table_ptrs) { + &shuffled_channel UNUSED, + const std::vector
&table_ptrs UNUSED) { return 0; } @@ -149,7 +150,7 @@ class Table { virtual void *GetShard(size_t shard_idx) = 0; virtual std::pair PrintTableStat() { return {0, 0}; } - virtual int32_t CacheTable(uint16_t pass_id) { return 0; } + virtual int32_t CacheTable(uint16_t pass_id UNUSED) { return 0; } // for patch model virtual void Revert() {} diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index a6cf44234773b..1057640842c2c 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -402,22 +402,22 @@ class CustomParser { virtual void Init(const std::vector& slots) = 0; virtual bool Init(const std::vector& slots) = 0; virtual void ParseOneInstance(const char* str, Record* instance) = 0; - virtual int ParseInstance(int len, - const char* str, - std::vector* instances) { + virtual int ParseInstance(int len UNUSED, + const char* str UNUSED, + std::vector* instances UNUSED) { return 0; } virtual bool ParseOneInstance( - const std::string& line, - std::function&, int)> - GetInsFunc) { // NOLINT + const std::string& line UNUSED, + std::function&, int)> GetInsFunc + UNUSED) { // NOLINT return true; } virtual bool ParseFileInstance( - std::function ReadBuffFunc, - std::function&, int, int)> - PullRecordsFunc, // NOLINT - int& lines) { // NOLINT + std::function ReadBuffFunc UNUSED, + std::function&, int, int)> PullRecordsFunc + UNUSED, // NOLINT + int& lines UNUSED) { // NOLINT return false; } }; @@ -1267,7 +1267,8 @@ class DataFeed { virtual void SetInsIdVec(MiniBatchGpuPack* pack) {} #endif - virtual void DumpWalkPath(std::string dump_path, size_t dump_rate) { + virtual void DumpWalkPath(std::string dump_path UNUSED, + size_t dump_rate UNUSED) { PADDLE_THROW(platform::errors::Unimplemented( "This function(DumpWalkPath) is not implemented.")); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 22bb5b703944d..1bc60993e36a0 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -25,6 +25,7 @@ #include #include #include +#include "paddle/phi/core/macros.h" #ifdef PADDLE_WITH_GLOO #include @@ -51,12 +52,12 @@ class Dataset { Dataset() {} virtual ~Dataset() {} // do sample - virtual void TDMSample(const std::string tree_name, - const std::string tree_path, - const std::vector tdm_layer_counts, - const uint16_t start_sample_layer, - const bool with_hierachy, - const uint16_t seed_, + virtual void TDMSample(const std::string tree_name UNUSED, + const std::string tree_path UNUSED, + const std::vector tdm_layer_counts UNUSED, + const uint16_t start_sample_layer UNUSED, + const bool with_hierachy UNUSED, + const uint16_t seed_ UNUSED, const uint16_t sample_slot) {} // set file list virtual void SetFileList(const std::vector& filelist) = 0; @@ -238,8 +239,9 @@ class DatasetImpl : public Dataset { virtual void WaitPreLoadDone(); virtual void ReleaseMemory(); virtual void LocalShuffle(); - virtual void GlobalShuffle(int thread_num = -1) {} - virtual void SlotsShuffle(const std::set& slots_to_replace) {} + virtual void GlobalShuffle(int thread_num UNUSED = -1) {} + virtual void SlotsShuffle( + const std::set& slots_to_replace UNUSED) {} virtual const std::vector& GetSlotsOriginalData() { return slots_shuffle_original_data_; } @@ -251,12 +253,12 @@ class DatasetImpl : public Dataset { virtual void MergeByInsId() {} virtual void PreprocessInstance() {} virtual void PostprocessInstance() {} - virtual void SetCurrentPhase(int current_phase) {} - virtual void 
GenerateLocalTablesUnlock(int table_id, - int feadim, - int read_thread_num, - int consume_thread_num, - int shard_num) {} + virtual void SetCurrentPhase(int current_phase UNUSED) {} + virtual void GenerateLocalTablesUnlock(int table_id UNUSED, + int feadim UNUSED, + int read_thread_num UNUSED, + int consume_thread_num UNUSED, + int shard_num UNUSED) {} virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -288,9 +290,9 @@ class DatasetImpl : public Dataset { virtual uint32_t GetPassID() { return pass_id_; } protected: - virtual int ReceiveFromClient(int msg_type, - int client_id, - const std::string& msg) { + virtual int ReceiveFromClient(int msg_type UNUSED, + int client_id UNUSED, + const std::string& msg UNUSED) { // TODO(yaoxuefeng) for SlotRecordDataset return -1; } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 743513e38aad1..177513b70cd3c 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -44,6 +44,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/phi/backends/dynload/port.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { @@ -179,14 +180,14 @@ class DeviceWorker { virtual void BindingDataFeedMemory() = 0; virtual void SetRootScope(Scope* root_scope); virtual void SetDataFeed(DataFeed* data_feed); - virtual void SetWorkerNum(int num) {} - virtual void CacheProgram(const ProgramDesc& main_program) {} + virtual void SetWorkerNum(int num UNUSED) {} + virtual void CacheProgram(const ProgramDesc& main_program UNUSED) {} virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} - virtual void Schedule(int taskid) {} + virtual void Schedule(int taskid UNUSED) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - virtual void SetStream(const gpuStream_t stream) {} - virtual void SetEvent(const gpuEvent_t event) {} + virtual void SetStream(const gpuStream_t stream UNUSED) {} + virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif virtual void SetNeedDumpField(bool need_dump_field) { need_dump_field_ = need_dump_field; @@ -256,7 +257,7 @@ class CPUWorkerBase : public DeviceWorker { virtual void TrainFiles() = 0; virtual void TrainFilesWithProfiler() {} virtual void PrintFetchVars() {} - virtual void CreateDeviceResource(const ProgramDesc& main_prog) {} + virtual void CreateDeviceResource(const ProgramDesc& main_prog UNUSED) {} protected: int thread_id_; @@ -684,7 +685,7 @@ class SectionWorker : public DeviceWorker { void PrepareUnusedVar(); void BindingDataFeedMemory() override {} - void CreateDeviceResource(const ProgramDesc& main_prog) override{}; + void CreateDeviceResource(const ProgramDesc& main_prog UNUSED) override{}; void TrainFiles() override; void TrainFilesWithProfiler() override{}; @@ -693,7 +694,7 @@ class SectionWorker : public DeviceWorker { const platform::Place& place() const { return place_; } - void SetDeviceIndex(int tid) override {} + void SetDeviceIndex(int tid UNUSED) override {} void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } void SetMicrobatchNum(int num) { num_microbatches_ = num; } void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; } @@ -755,7 +756,7 @@ class HeterSectionWorker : public DeviceWorker { ~HeterSectionWorker() override {} void Initialize(const TrainerDesc& desc) override; - void CreateDeviceResource(const ProgramDesc& main_prog) 
override{}; + void CreateDeviceResource(const ProgramDesc& main_prog UNUSED) override{}; void TrainFiles() override; void TrainFilesWithProfiler() override; diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h index 47e74e2d80a39..37f790a0d3f41 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference.h +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { @@ -83,19 +84,19 @@ class NoNeedBufferVarsInference { } }; -#define DECLARE_NO_NEED_BUFFER_VARS_INFERER(class_type, ...) \ - class class_type final \ - : public ::paddle::framework::NoNeedBufferVarsInference { \ - public: \ - using ::paddle::framework::NoNeedBufferVarsInference:: \ - NoNeedBufferVarsInference; \ - \ - const std::unordered_set &operator()( \ - const ::paddle::framework::InferNoNeedBufferVarsContext &ctx) \ - const final { \ - static std::unordered_set __ret__{__VA_ARGS__}; \ - return __ret__; \ - } \ +#define DECLARE_NO_NEED_BUFFER_VARS_INFERER(class_type, ...) \ + class class_type final \ + : public ::paddle::framework::NoNeedBufferVarsInference { \ + public: \ + using ::paddle::framework::NoNeedBufferVarsInference:: \ + NoNeedBufferVarsInference; \ + \ + const std::unordered_set &operator()( \ + const ::paddle::framework::InferNoNeedBufferVarsContext &ctx \ + UNUSED) const final { \ + static std::unordered_set __ret__{__VA_ARGS__}; \ + return __ret__; \ + } \ } class InferNoNeedBufferVarsFN { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 13ef07ab9855e..941e8ea050e08 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -148,7 +148,7 @@ class OpRegistry { }; template -inline void CheckKernelLaunch(const char* op_type) {} +inline void CheckKernelLaunch(const char* op_type UNUSED) {} #ifdef PADDLE_WITH_CUDA template <> diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 5d2ab27469af1..3ca455da1dd40 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -1365,7 +1365,7 @@ struct KernelRegistrar { return 0; \ } \ void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) + const ::phi::KernelKey& kernel_key UNUSED, ::phi::Kernel* kernel UNUSED) #else #define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ reg_type, kernel_name, backend, layout, kernel_fn) \ diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index f2782a63c9020..0877a8e24468c 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -12,17 +12,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/kernels/empty_kernel.h" - #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/macros.h" namespace phi { template void EmptyKernel(const Context& dev_ctx, const IntArray& shape, - DataType dtype, + DataType dtype UNUSED, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); @@ -31,7 +31,7 @@ void EmptyKernel(const Context& dev_ctx, template void EmptyLikeKernel(const Context& dev_ctx, const DenseTensor& x, - DataType dtype, + DataType dtype UNUSED, DenseTensor* out) { dev_ctx.template Alloc(out); } diff --git a/paddle/phi/kernels/funcs/compound_functors.h b/paddle/phi/kernels/funcs/compound_functors.h index a27d0e5a9f164..0fd3fd0e932fc 100644 --- a/paddle/phi/kernels/funcs/compound_functors.h +++ b/paddle/phi/kernels/funcs/compound_functors.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include - +#include "paddle/phi/core/macros.h" namespace phi { namespace funcs { @@ -33,7 +33,7 @@ struct BinaryCompoundFunctor { return func1_(x, intermediat_out); } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(y); } + inline HOSTDEVICE T GetIntermediateOut(T x UNUSED, T y) { return func2_(y); } BinaryFunctor func1_; UnaryFunctor func2_; @@ -64,16 +64,18 @@ struct BinaryCompoundGradDxFunctor { const UnaryFun &unary_fun) : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} - inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { + inline HOSTDEVICE T Recompute(T x, T y, T out UNUSED, T dout) { return dout * d_binary_fun_.Dx(x, unary_fun_(y)); } - inline HOSTDEVICE T - UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { + inline HOSTDEVICE T UseIntermediateOut( + T x, T y UNUSED, T intermediate_out, T out UNUSED, T dout) { return dout * d_binary_fun_.Dx(x, intermediate_out); } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); } + inline HOSTDEVICE T GetIntermediateOut(T x UNUSED, T y) { + return unary_fun_(y); + } private: DBinaryFun d_binary_fun_; @@ -210,13 +212,13 @@ struct BinaryCompoundGradDIntermedaiteOutFunctor { const UnaryFun &unary_fun) : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} - inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { + inline HOSTDEVICE T Recompute(T x, T y, T out UNUSED, T dout) { return dout * d_binary_fun_.Dy(x, unary_fun_(y)); } inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, - T out, + T out UNUSED, T dout) { return dout * d_binary_fun_.Dy(x, intermediate_out); } diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 3543c0c6aa066..dc78bd7098411 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/macros.h" #if defined(__xpu__) #include @@ -259,7 +260,7 @@ template <> struct FMaxGradDx { HOSTDEVICE dtype::float16 operator()(dtype::float16 x, dtype::float16 y, - dtype::float16 out, + dtype::float16 out UNUSED, dtype::float16 dout) const { return dout * static_cast((x >= y) || dtype::isnan(y)); } @@ -267,7 +268,7 @@ struct FMaxGradDx { template <> struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + HOSTDEVICE int operator()(int x, int y, int out UNUSED, int dout) const { return dout * static_cast((x >= y)); } }; @@ -276,7 +277,7 @@ template <> struct FMaxGradDx { HOSTDEVICE int64_t operator()(int64_t x, int64_t y, - int64_t out, + int64_t out UNUSED, int64_t dout) const { return dout * static_cast((x >= y)); } @@ -293,7 +294,7 @@ template <> struct FMaxGradDy { HOSTDEVICE dtype::float16 operator()(dtype::float16 x, dtype::float16 y, - dtype::float16 out, + dtype::float16 out UNUSED, dtype::float16 dout) const { return dout * static_cast(!((x >= y) || dtype::isnan(y))); } @@ -303,7 +304,7 @@ template <> struct FMaxGradDy { HOSTDEVICE int64_t operator()(int64_t x, int64_t y, - int64_t out, + int64_t out UNUSED, int64_t dout) const { return dout * static_cast(!((x >= y))); } @@ -311,7 +312,7 @@ struct FMaxGradDy { template <> struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + HOSTDEVICE int operator()(int x, int y, int out UNUSED, int dout) const { return dout * static_cast(!((x >= y))); } }; @@ -327,7 +328,7 @@ template <> struct FMinGradDx { HOSTDEVICE dtype::float16 operator()(dtype::float16 x, dtype::float16 y, - dtype::float16 out, + dtype::float16 out UNUSED, dtype::float16 dout) const { return dout * static_cast((x <= y) || dtype::isnan(y)); } @@ -335,7 +336,7 @@ struct FMinGradDx { template <> struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + HOSTDEVICE int operator()(int x, int y, int out UNUSED, int dout) const { return dout * static_cast((x <= y)); } }; @@ -344,7 +345,7 @@ template <> struct FMinGradDx { HOSTDEVICE int64_t operator()(int64_t x, int64_t y, - int64_t out, + int64_t out UNUSED, int64_t dout) const { return dout * static_cast((x <= y)); } @@ -361,7 +362,7 @@ template <> struct FMinGradDy { HOSTDEVICE dtype::float16 operator()(dtype::float16 x, dtype::float16 y, - dtype::float16 out, + dtype::float16 out UNUSED, dtype::float16 dout) const { return dout * static_cast(!((x <= y) || dtype::isnan(y))); } @@ -369,7 +370,7 @@ struct FMinGradDy { template <> struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + HOSTDEVICE int operator()(int x, int y, int out UNUSED, int dout) const { return dout * static_cast(!((x <= y))); } }; @@ -378,7 +379,7 @@ template <> struct FMinGradDy { HOSTDEVICE int64_t operator()(int64_t x, int64_t y, - int64_t out, + int64_t out UNUSED, int64_t dout) const { return dout * static_cast(!((x <= y))); } diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index 4625414e7a5be..9648a7d845ff0 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" - +#include "paddle/phi/core/macros.h" namespace phi { namespace funcs { @@ -29,7 +29,8 @@ struct ForRange { template <> struct ForRange { - ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {} + ForRange(const phi::CPUContext& dev_ctx UNUSED, size_t limit) + : limit_(limit) {} template void operator()(Function func) const { diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index e0e7ec3d403f1..f1d3772cc25a5 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - namespace phi { namespace funcs { @@ -130,12 +130,12 @@ struct SumGradFunctor { typename DY, typename Dim> void operator()(const DeviceContext& place, - X* x, - Y* y, + X* x UNUSED, + Y* y UNUSED, DX* dx, DY* dy, const Dim& dim, - int size) { + int size UNUSED) { dx->device(place) = dy->broadcast(dim); } }; @@ -171,7 +171,7 @@ struct MaxOrMinGradFunctor { DX* dx, DY* dy, const Dim& dim, - int size) { + int size UNUSED) { auto equals = (*x) == y->broadcast(dim); auto ones = dx->constant(1); auto zeros = dx->constant(0); diff --git a/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h b/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h index 386bb1b47ef6d..bdedcee095707 100644 --- a/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h @@ -15,15 +15,15 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/meshgrid_grad_kernel.h" - namespace phi { template void MeshgridBackward(const Context& ctx, - const std::vector& ins, + const std::vector& ins UNUSED, const std::vector& out_grad, std::vector outs) { int n = out_grad.size(); diff --git a/paddle/phi/kernels/impl/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h index e9d1aec0f09c5..5665c9713c476 100644 --- a/paddle/phi/kernels/impl/reduce_grad.h +++ b/paddle/phi/kernels/impl/reduce_grad.h @@ -14,10 +14,10 @@ #pragma once +#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/reduce_grad_functions.h" - namespace phi { template void ComputeFromInput(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, + const DenseTensor& out_grad UNUSED, const paddle::optional& out, const DenseTensor& input2, const std::vector& dims, - bool keep_dim, + bool keep_dim UNUSED, bool reduce_all, DenseTensor* x_grad) { reduce_all = recompute_reduce_all(x, dims, reduce_all); diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h index 1292a3af36352..3d2a0a3d0db67 100644 --- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -54,7 +54,7 @@ void SetValueGradImpl(const Context& dev_ctx, const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, - const std::vector& none_axes, + const std::vector& none_axes UNUSED, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ( diff 
--git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index 102686d24ad4a..2be0ba5649711 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -17,8 +17,8 @@ #include "paddle/phi/backends/onednn/onednn_helper.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/expect.h" +#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/cpu/conv_util.h" - namespace phi { namespace onednn { @@ -50,7 +50,7 @@ class ConvOneDNNHandlerT const std::string& padding_algorithm, const std::vector& dilations_in, int groups, - const std::string& data_format, + const std::string& data_format UNUSED, bool is_test, bool is_BFLOAT16, const std::string& fuse_activation, From 8386417ed2e08f465ee69492b4ef3131afe47b78 Mon Sep 17 00:00:00 2001 From: lijialin03 <124568209+lijialin03@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:20:52 +0800 Subject: [PATCH 081/405] add paddle.optimizer.LBFGS API and a modify its test case test=develop (#51912) * modify numel in lbfgs and add a new test case. test=develop * change param 'lr' to 'learning_rate' in lbfgs and its test * add opt LBFGS and change test --- .../fluid/tests/unittests/test_lbfgs_class.py | 560 ++++++++++++++ .../fluid/tests/unittests/test_lbfgs_v2.py | 274 ------- python/paddle/incubate/optimizer/lbfgs.py | 21 +- python/paddle/optimizer/__init__.py | 2 + python/paddle/optimizer/lbfgs.py | 701 ++++++++++++++++++ 5 files changed, 1275 insertions(+), 283 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_lbfgs_class.py delete mode 100644 python/paddle/fluid/tests/unittests/test_lbfgs_v2.py create mode 100644 python/paddle/optimizer/lbfgs.py diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs_class.py b/python/paddle/fluid/tests/unittests/test_lbfgs_class.py new file mode 100644 index 0000000000000..44c854f2119ef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lbfgs_class.py @@ -0,0 +1,560 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
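For quick reference, a minimal sketch of how the relocated optimizer is driven after this change: paddle.optimizer.LBFGS takes the renamed 'learning_rate' argument (formerly 'lr' in paddle.incubate.optimizer.LBFGS) and is stepped through a closure, mirroring the new test below. The toy tensors, parameter, and iteration count here are illustrative placeholders only, not part of the patch.

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    y = 2.0 * x  # fit w so that w * x reproduces y
    w = paddle.create_parameter(shape=[1], dtype='float32')

    opt = paddle.optimizer.LBFGS(
        learning_rate=1.0,              # renamed from 'lr' by this change
        max_iter=10,
        line_search_fn='strong_wolfe',
        parameters=[w],
    )

    def closure():
        # the closure recomputes the loss, clears old grads and backprops
        loss = paddle.nn.functional.mse_loss(w * x, y)
        opt.clear_grad()
        loss.backward()
        return loss

    for _ in range(5):
        opt.step(closure)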
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.optimizer import lbfgs as incubate_lbfgs +from paddle.incubate.optimizer import line_search_dygraph +from paddle.optimizer import lbfgs + +np.random.seed(123) + +# func()should be func(w, x)where w is parameter to be optimize ,x is input of optimizer func +# np_w is the init parameter of w + + +class Net(paddle.nn.Layer): + def __init__(self, np_w, func): + super().__init__() + self.func = func + w = paddle.to_tensor(np_w) + self.w = paddle.create_parameter( + shape=w.shape, + dtype=w.dtype, + default_initializer=paddle.nn.initializer.Assign(w), + ) + + def forward(self, x): + return self.func(self.w, x) + + +def train_step(inputs, targets, net, opt): + def closure(): + outputs = net(inputs) + loss = paddle.nn.functional.mse_loss(outputs, targets) + opt.clear_grad() + loss.backward() + return loss + + loss = opt.step(closure) + return loss + + +class TestLbfgs(unittest.TestCase): + def test_function_fix_incubate(self): + paddle.disable_static() + np_w = np.random.rand(1).astype(np.float32) + + input = np.random.rand(1).astype(np.float32) + weights = [np.random.rand(1).astype(np.float32) for i in range(5)] + targets = [weights[i] * input for i in range(5)] + + def func(w, x): + return w * x + + net = Net(np_w, func) + opt = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + for weight, target in zip(weights, targets): + input = paddle.to_tensor(input) + target = paddle.to_tensor(target) + loss = 1 + while loss > 1e-4: + loss = train_step(input, target, net, opt) + np.testing.assert_allclose(net.w, weight, rtol=1e-05) + + def test_inf_minima_incubate(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs1(x): + # weight[0] = 1.01 weight[1] = 0.99 + return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs1(input), outputs2(input)] + input = paddle.to_tensor(input) + + def func1(extream_point, x): + return ( + x * x * x + - 3 * x * x + + 3 * extream_point[0] * extream_point[1] * x + ) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net1 = Net(extream_point, func1) + # converge of old_sk.pop() + opt1 = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=1, + line_search_fn='strong_wolfe', + parameters=net1.parameters(), + ) + + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn=None, + parameters=net2.parameters(), + ) + + n_iter = 0 + while n_iter < 20: + loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1) + n_iter = opt1.state_dict()["state"]["func_evals"] + + n_iter = 0 + while n_iter < 10: + loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2) + n_iter = opt1.state_dict()["state"]["func_evals"] + + def test_error_incubate(self): + # test parameter is not Paddle Tensor + def error_func1(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + return 
incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=3, + line_search_fn='strong_wolfe', + parameters=extream_point, + ) + + self.assertRaises(TypeError, error_func1) + + def test_error2_incubate(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs2(input)] + input = paddle.to_tensor(input) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn='None', + parameters=net2.parameters(), + ) + + def error_func(): + n_iter = 0 + while n_iter < 10: + loss = train_step( + input, paddle.to_tensor(targets[0]), net2, opt2 + ) + n_iter = opt2.state_dict()["state"]["func_evals"] + + self.assertRaises(RuntimeError, error_func) + + def test_line_search_incubate(self): + def func1(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) + + def func2(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) + + def func3(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) + + line_search_dygraph._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=1, + ) + + line_search_dygraph._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=0, + ) + + line_search_dygraph._strong_wolfe( + func2, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + line_search_dygraph._strong_wolfe( + func3, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + line_search_dygraph._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + [0.1, 0.5], + ) + + line_search_dygraph._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([-3.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.1]), + [0.1, 0.5], + ) + + def test_error3_incubate(self): + # test parameter shape size <= 0 + def error_func3(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + + def func(w, x): + return w * x + + net = Net(extream_point, func) + net.w = paddle.create_parameter( + shape=[-1, 2], + dtype=net.w.dtype, + ) + opt = incubate_lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + self.assertRaises(AssertionError, error_func3) + + def test_function_fix(self): + paddle.disable_static() + np_w = 
np.random.rand(1).astype(np.float32) + + input = np.random.rand(1).astype(np.float32) + weights = [np.random.rand(1).astype(np.float32) for i in range(5)] + targets = [weights[i] * input for i in range(5)] + + def func(w, x): + return w * x + + net = Net(np_w, func) + opt = lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + for weight, target in zip(weights, targets): + input = paddle.to_tensor(input) + target = paddle.to_tensor(target) + loss = 1 + while loss > 1e-4: + loss = train_step(input, target, net, opt) + np.testing.assert_allclose(net.w, weight, rtol=1e-05) + + def test_inf_minima(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs1(x): + # weight[0] = 1.01 weight[1] = 0.99 + return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs1(input), outputs2(input)] + input = paddle.to_tensor(input) + + def func1(extream_point, x): + return ( + x * x * x + - 3 * x * x + + 3 * extream_point[0] * extream_point[1] * x + ) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net1 = Net(extream_point, func1) + # converge of old_sk.pop() + opt1 = lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=1, + line_search_fn='strong_wolfe', + parameters=net1.parameters(), + ) + + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn=None, + parameters=net2.parameters(), + ) + + n_iter = 0 + while n_iter < 20: + loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1) + n_iter = opt1.state_dict()["state"]["func_evals"] + + n_iter = 0 + while n_iter < 10: + loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2) + n_iter = opt1.state_dict()["state"]["func_evals"] + + def test_error(self): + # test parameter is not Paddle Tensor + def error_func1(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + return lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=3, + line_search_fn='strong_wolfe', + parameters=extream_point, + ) + + self.assertRaises(TypeError, error_func1) + + def test_error2(self): + # not converage + input = np.random.rand(1).astype(np.float32) + + def outputs2(x): + # weight[0] = 4 weight[1] = 2 + return pow(x, 4) + 5 * pow(x, 2) + + targets = [outputs2(input)] + input = paddle.to_tensor(input) + + def func2(extream_point, x): + return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) + + extream_point = np.array([-2.34, 1.45]).astype('float32') + net2 = Net(extream_point, func2) + # converge of line_search = None + opt2 = lbfgs.LBFGS( + learning_rate=1, + max_iter=50, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=10, + line_search_fn='None', + parameters=net2.parameters(), + ) + + def error_func(): + n_iter = 0 + while n_iter < 10: + loss = train_step( + input, paddle.to_tensor(targets[0]), net2, opt2 + ) + n_iter = opt2.state_dict()["state"]["func_evals"] 
+ + self.assertRaises(RuntimeError, error_func) + + def test_line_search(self): + def func1(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) + + def func2(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) + + def func3(x, alpha, d): + return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) + + lbfgs._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=1, + ) + + lbfgs._strong_wolfe( + func1, + paddle.to_tensor([1.0]), + paddle.to_tensor([0.001]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([0.0]), + max_ls=0, + ) + + lbfgs._strong_wolfe( + func2, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + lbfgs._strong_wolfe( + func3, + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.001]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + max_ls=1, + ) + + lbfgs._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + [0.1, 0.5], + ) + + lbfgs._cubic_interpolate( + paddle.to_tensor([2.0]), + paddle.to_tensor([0.0]), + paddle.to_tensor([-3.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([1.0]), + paddle.to_tensor([-0.1]), + [0.1, 0.5], + ) + + def test_error3(self): + # test parameter shape size <= 0 + def error_func3(): + extream_point = np.array([-1, 2]).astype('float32') + extream_point = paddle.to_tensor(extream_point) + + def func(w, x): + return w * x + + net = Net(extream_point, func) + net.w = paddle.create_parameter( + shape=[-1, 2], + dtype=net.w.dtype, + ) + opt = lbfgs.LBFGS( + learning_rate=1, + max_iter=10, + max_eval=None, + tolerance_grad=1e-07, + tolerance_change=1e-09, + history_size=5, + line_search_fn='strong_wolfe', + parameters=net.parameters(), + ) + + self.assertRaises(AssertionError, error_func3) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py b/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py deleted file mode 100644 index 9617938967cd3..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_lbfgs_v2.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
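The two line-search helpers exercised above lend themselves to a hand-checkable sanity test: on a symmetric quadratic the cubic interpolation lands exactly on the minimizer, and the strong Wolfe search returns the exact minimizing step along the descent direction. A rough sketch of such a check (the sample function and numbers are chosen only for illustration and are not taken from the patch):

    import paddle
    from paddle.optimizer import lbfgs

    # f(t) = (t - 1)^2: f(0) = f(2) = 1, f'(0) = -2, f'(2) = 2, minimum at t = 1.
    step = lbfgs._cubic_interpolate(
        paddle.to_tensor([0.0]), paddle.to_tensor([1.0]), paddle.to_tensor([-2.0]),
        paddle.to_tensor([2.0]), paddle.to_tensor([1.0]), paddle.to_tensor([2.0]),
    )
    print(float(step))  # 1.0

    # phi(alpha) = f(x + alpha * d) with f(v) = sum(v ** 2); starting from x = [1.]
    # along d = [-2.], the exact minimizing step is alpha = 0.5.
    def obj_func(x, alpha, d):
        v = x + alpha * d
        return (v * v).sum(), 2.0 * v  # loss and gradient at the trial point

    x = paddle.to_tensor([1.0])
    grad = paddle.to_tensor([2.0])
    d = -grad
    loss, grad_new, alpha, n_evals = lbfgs._strong_wolfe(
        obj_func, x, paddle.to_tensor(1.0), d, paddle.to_tensor(1.0), grad, grad.dot(d)
    )
    print(float(alpha))  # 0.5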
- -import unittest - -import numpy as np - -import paddle -from paddle.incubate.optimizer import LBFGS -from paddle.incubate.optimizer.line_search_dygraph import ( - _cubic_interpolate, - _strong_wolfe, -) - -np.random.seed(123) - -# func()should be func(w, x)where w is parameter to be optimize ,x is input of optimizer func -# np_w is the init parameter of w - - -class Net(paddle.nn.Layer): - def __init__(self, np_w, func): - super().__init__() - self.func = func - w = paddle.to_tensor(np_w) - self.w = paddle.create_parameter( - shape=w.shape, - dtype=w.dtype, - default_initializer=paddle.nn.initializer.Assign(w), - ) - - def forward(self, x): - return self.func(self.w, x) - - -def train_step(inputs, targets, net, opt): - def closure(): - outputs = net(inputs) - loss = paddle.nn.functional.mse_loss(outputs, targets) - opt.clear_grad() - loss.backward() - return loss - - loss = opt.step(closure) - return loss - - -class TestLbfgs(unittest.TestCase): - def test_function_fix(self): - paddle.disable_static() - np_w = np.random.rand(1).astype(np.float32) - - input = np.random.rand(1).astype(np.float32) - weights = [np.random.rand(1).astype(np.float32) for i in range(5)] - targets = [weights[i] * input for i in range(5)] - - def func(w, x): - return w * x - - net = Net(np_w, func) - opt = LBFGS( - lr=1, - max_iter=10, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=5, - line_search_fn='strong_wolfe', - parameters=net.parameters(), - ) - - for weight, target in zip(weights, targets): - input = paddle.to_tensor(input) - target = paddle.to_tensor(target) - loss = 1 - while loss > 1e-4: - loss = train_step(input, target, net, opt) - np.testing.assert_allclose(net.w, weight, rtol=1e-05) - - def test_inf_minima(self): - # not converage - input = np.random.rand(1).astype(np.float32) - - def outputs1(x): - # weight[0] = 1.01 weight[1] = 0.99 - return x * x * x - 3 * x * x + 3 * 1.01 * 0.99 * x - - def outputs2(x): - # weight[0] = 4 weight[1] = 2 - return pow(x, 4) + 5 * pow(x, 2) - - targets = [outputs1(input), outputs2(input)] - input = paddle.to_tensor(input) - - def func1(extream_point, x): - return ( - x * x * x - - 3 * x * x - + 3 * extream_point[0] * extream_point[1] * x - ) - - def func2(extream_point, x): - return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) - - extream_point = np.array([-2.34, 1.45]).astype('float32') - net1 = Net(extream_point, func1) - # converge of old_sk.pop() - opt1 = LBFGS( - lr=1, - max_iter=10, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=1, - line_search_fn='strong_wolfe', - parameters=net1.parameters(), - ) - - net2 = Net(extream_point, func2) - # converge of line_search = None - opt2 = LBFGS( - lr=1, - max_iter=50, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=10, - line_search_fn=None, - parameters=net2.parameters(), - ) - - n_iter = 0 - while n_iter < 20: - loss = train_step(input, paddle.to_tensor(targets[0]), net1, opt1) - n_iter = opt1.state_dict()["state"]["func_evals"] - - n_iter = 0 - while n_iter < 10: - loss = train_step(input, paddle.to_tensor(targets[1]), net2, opt2) - n_iter = opt1.state_dict()["state"]["func_evals"] - - def test_error(self): - # test parameter is not Paddle Tensor - def error_func1(): - extream_point = np.array([-1, 2]).astype('float32') - extream_point = paddle.to_tensor(extream_point) - return LBFGS( - lr=1, - max_iter=10, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=3, - 
line_search_fn='strong_wolfe', - parameters=extream_point, - ) - - self.assertRaises(TypeError, error_func1) - - def test_error2(self): - # not converage - input = np.random.rand(1).astype(np.float32) - - def outputs2(x): - # weight[0] = 4 weight[1] = 2 - return pow(x, 4) + 5 * pow(x, 2) - - targets = [outputs2(input)] - input = paddle.to_tensor(input) - - def func2(extream_point, x): - return pow(x, extream_point[0]) + 5 * pow(x, extream_point[1]) - - extream_point = np.array([-2.34, 1.45]).astype('float32') - net2 = Net(extream_point, func2) - # converge of line_search = None - opt2 = LBFGS( - lr=1, - max_iter=50, - max_eval=None, - tolerance_grad=1e-07, - tolerance_change=1e-09, - history_size=10, - line_search_fn='None', - parameters=net2.parameters(), - ) - - def error_func(): - n_iter = 0 - while n_iter < 10: - loss = train_step( - input, paddle.to_tensor(targets[0]), net2, opt2 - ) - n_iter = opt2.state_dict()["state"]["func_evals"] - - self.assertRaises(RuntimeError, error_func) - - def test_line_search(self): - def func1(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor([0.0]) - - def func2(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor([1.0]) - - def func3(x, alpha, d): - return paddle.to_tensor(x + alpha * d), paddle.to_tensor([-1.0]) - - _strong_wolfe( - func1, - paddle.to_tensor([1.0]), - paddle.to_tensor([0.001]), - paddle.to_tensor([0.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([0.0]), - paddle.to_tensor([0.0]), - max_ls=0, - ) - - _strong_wolfe( - func2, - paddle.to_tensor([1.0]), - paddle.to_tensor([-0.001]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - max_ls=1, - ) - - _strong_wolfe( - func3, - paddle.to_tensor([1.0]), - paddle.to_tensor([-0.001]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - max_ls=1, - ) - - _cubic_interpolate( - paddle.to_tensor([2.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([0.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([2.0]), - paddle.to_tensor([0.0]), - [0.1, 0.5], - ) - - _cubic_interpolate( - paddle.to_tensor([2.0]), - paddle.to_tensor([0.0]), - paddle.to_tensor([-3.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([1.0]), - paddle.to_tensor([-0.1]), - [0.1, 0.5], - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/incubate/optimizer/lbfgs.py b/python/paddle/incubate/optimizer/lbfgs.py index 937a3b2f9af3f..ae7511ae03e94 100644 --- a/python/paddle/incubate/optimizer/lbfgs.py +++ b/python/paddle/incubate/optimizer/lbfgs.py @@ -18,10 +18,12 @@ import paddle from paddle.optimizer import Optimizer +from paddle.utils import deprecated from .line_search_dygraph import _strong_wolfe +@deprecated(since="2.5.0", update_to="paddle.optimizer.LBFGS", level=1) class LBFGS(Optimizer): r""" The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. @@ -40,7 +42,7 @@ class LBFGS(Optimizer): Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - lr (float, optional): learning rate .The default value is 1. + learning_rate (float, optional): learning rate .The default value is 1. max_iter (int, optional): maximal number of iterations per optimization step. The default value is 20. 
max_eval (int, optional): maximal number of function evaluations per optimization @@ -97,7 +99,7 @@ def forward(self, x): return self.w * x net = Net() - opt = LBFGS(lr=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters()) + opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters()) def train_step(inputs, targets): def closure(): outputs = net(inputs) @@ -118,7 +120,7 @@ def closure(): def __init__( self, - lr=1.0, + learning_rate=1.0, max_iter=20, max_eval=None, tolerance_grad=1e-7, @@ -133,7 +135,7 @@ def __init__( if max_eval is None: max_eval = max_iter * 5 // 4 - self.lr = lr + self.learning_rate = learning_rate self.max_iter = max_iter self.max_eval = max_eval self.tolerance_grad = tolerance_grad @@ -202,7 +204,7 @@ def _gather_flat_grad(self): def _add_grad(self, alpha, direction): offset = 0 for p in self._params: - numel = p.numel().item() + numel = reduce(lambda x, y: x * y, p.shape) p = paddle.assign( p.add( direction[offset : offset + numel].reshape(p.shape) * alpha @@ -234,11 +236,10 @@ def step(self, closure): """ with paddle.no_grad(): - # Make sure the closure is always called with grad enabled closure = paddle.enable_grad()(closure) - lr = self.lr + learning_rate = self.learning_rate max_iter = self.max_iter max_eval = self.max_eval tolerance_grad = self.tolerance_grad @@ -342,9 +343,11 @@ def step(self, closure): ############################################################ # reset initial guess for step size if state['n_iter'] == 1: - alpha = min(1.0, 1.0 / flat_grad.abs().sum()) * lr + alpha = ( + min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate + ) else: - alpha = lr + alpha = learning_rate # directional derivative gtd = flat_grad.dot(d) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index cef51897b20ab..7d9737dc7da1f 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -22,6 +22,7 @@ from .sgd import SGD # noqa: F401 from .momentum import Momentum # noqa: F401 from .lamb import Lamb # noqa: F401 +from .lbfgs import LBFGS # noqa: F401 from . import lr # noqa: F401 __all__ = [ # noqa @@ -35,4 +36,5 @@ 'SGD', 'Momentum', 'Lamb', + 'LBFGS', ] diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py new file mode 100644 index 0000000000000..aaa933541962f --- /dev/null +++ b/python/paddle/optimizer/lbfgs.py @@ -0,0 +1,701 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from functools import reduce + +import paddle + +from ..fluid import framework +from .optimizer import Optimizer + +__all__ = [] + + +def _cubic_interpolate(x1, f1, g1, x2, f2, g2, bounds=None): + r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2). 
+ Use two points and their gradient to determine a cubic function and get the minimun point + between them in the cubic curve. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp59: formula 3.59 + + Args: + x1, f1, g1: point1's position, value and gradient. + x2, f2, g2: point2's position, value and gradient. + bounds: bounds of interpolation area + + Returns: + min_pos: the minimun point between the specified points in the cubic curve. + """ + # Compute bounds of interpolation area + if bounds is not None: + xmin_bound, xmax_bound = bounds + else: + xmin_bound, xmax_bound = (x1, x2) if x1 <= x2 else (x2, x1) + + d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2) + d2_square = d1**2 - g1 * g2 + if d2_square >= 0: + d2 = d2_square.sqrt() + if x1 <= x2: + min_pos = x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2)) + else: + min_pos = x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2)) + return min(max(min_pos, xmin_bound), xmax_bound) + else: + return (xmin_bound + xmax_bound) / 2.0 + + +def _strong_wolfe( + obj_func, + xk, + alpha, + d, + loss, + grad, + gtd, + c1=1e-4, + c2=0.9, + tolerance_change=1e-9, + max_ls=25, +): + r"""Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. + pp60: Algorithm 3.5 (Line Search Algorithm). + + Args: + obj_func: the objective function to minimize. ```` accepts a multivariate input and returns a scalar. + xk (Tensor): the starting point of the iterates. + alpha (Scalar): the initial step size. + d (Tensor): search direction. + loss (scalar): the initial loss + grad (Tensor): the initial grad + c1 (Scalar): parameter for sufficient decrease condition. + c2 (Scalar): parameter for curvature condition. + tolerance_change (Scalar): terminates if the change of function value/position/parameter between + two iterations is smaller than this value. + max_ls(int): max iteration of line search. + alpha_max (float): max step length. + + Returns: + loss_new (Scaler): loss of obj_func at final alpha. + grad_new, (Tensor): derivative of obj_func at final alpha. + alpha(Tensor): optimal step length, or 0. if the line search algorithm did not converge. + ls_func_evals (Scaler): number of objective function called in line search process. + + Following summarizes the essentials of the strong Wolfe line search algorithm. + Some notations used in the description: + + - `func` denotes the objective function. + - `obi_func` is a function of step size alpha, restricting `obj_func` on a line. + + obi_func = func(xk + alpha * d), + where xk is the position of k'th iterate, d is the line search direction(decent direction), + and a is the step size. + - alpha : substitute of alpha + - a1 is alpha of last iteration, which is alpha_(i-1). + - a2 is alpha of current iteration, which is alpha_i. + - a_lo is alpha in left position when calls zoom, which is alpha_low. + - a_hi is alpha in right position when calls zoom, which is alpha_high. + + Line Search Algorithm: + repeat + Compute obi_func(a2) and derphi(a2). + 1. If obi_func(a2) > obi_func(0) + c_1 * a2 * obi_func'(0) or [obi_func(a2) >= obi_func(a1) and i > 1], + alpha= zoom(a1, a2) and stop; + + 2. If |obi_func'(a2)| <= -c_2 * obi_func'(0), + alpha= a2 and stop; + + 3. 
If obi_func'(a2) >= 0, + alpha= zoom(a2, a1) and stop; + + a1 = a2 + a2 = min(2 * a2, a2) + i = i + 1 + end(repeat) + + zoom(a_lo, a_hi) Algorithm: + repeat + aj = cubic_interpolation(a_lo, a_hi) + Compute obi_func(aj) and derphi(aj). + 1. If obi_func(aj) > obi_func(0) + c_1 * aj * obi_func'(0) or obi_func(aj) >= obi_func(a_lo), + then a_hi <- aj; + 2. + 2.1. If |obi_func'(aj)| <= -c_2 * obi_func'(0), then alpha= a2 and stop; + + 2.2. If obi_func'(aj) * (a2 - a1) >= 0, then a_hi = a_lo + + a_lo = aj; + end(repeat) + """ + + d_norm = d.abs().max() + grad = grad.clone() + # evaluate objective and gradient using initial step + loss_new, grad_new = obj_func(xk, alpha, d) + ls_func_evals = 1 + gtd_new = paddle.dot(grad_new, d) + + # bracket an interval containing a point satisfying the Wolfe criteria + t_prev, f_prev, g_prev, gtd_prev = ( + paddle.to_tensor(0, dtype=grad.dtype), + loss, + grad, + gtd, + ) + done = False + ls_iter = 0 + while ls_iter < max_ls: + # check conditions + if loss_new > (loss + c1 * alpha * gtd) or ( + ls_iter > 1 and loss_new >= f_prev + ): + bracket = [t_prev, alpha] + bracket_f = [f_prev, loss_new] + bracket_g = [g_prev, grad_new.clone()] + bracket_gtd = [gtd_prev, gtd_new] + break + + if paddle.abs(gtd_new) <= -c2 * gtd: + bracket = [alpha] + bracket_f = [loss_new] + bracket_g = [grad_new] + done = True + break + + if gtd_new >= 0: + bracket = [t_prev, alpha] + bracket_f = [f_prev, loss_new] + bracket_g = [g_prev, grad_new.clone()] + bracket_gtd = [gtd_prev, gtd_new] + break + + # interpolate + min_step = alpha + 0.01 * (alpha - t_prev) + max_step = alpha * 10 + tmp = alpha + alpha = _cubic_interpolate( + t_prev, + f_prev, + gtd_prev, + alpha, + loss_new, + gtd_new, + bounds=(min_step, max_step), + ) + + # next step + t_prev = tmp + f_prev = loss_new + g_prev = grad_new.clone() + gtd_prev = gtd_new + + loss_new, grad_new = obj_func(xk, alpha, d) + ls_func_evals += 1 + gtd_new = grad_new.dot(d) + ls_iter += 1 + + # reached max number of iterations? + if ls_iter == max_ls: + bracket = [0, alpha] + bracket_f = [loss, loss_new] + bracket_g = [grad, grad_new] + + # zoom phase: we now have a point satisfying the criteria, or + # a bracket around it. We refine the bracket until we find the + # exact point satisfying the criteria + insuf_progress = False + # find high and low points in bracket + low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) + while not done and ls_iter < max_ls: + # line-search bracket is so small + if paddle.abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: + break + + # compute new trial value + alpha = _cubic_interpolate( + bracket[0], + bracket_f[0], + bracket_gtd[0], + bracket[1], + bracket_f[1], + bracket_gtd[1], + ) + + # test that we are making sufficient progress: + # in case `alpha` is so close to boundary, we mark that we are making + # insufficient progress, and if + # + we have made insufficient progress in the last step, or + # + `alpha` is at one of the boundary, + # we will move `alpha` to a position which is `0.1 * len(bracket)` + # away from the nearest boundary point. 
+ + eps = 0.1 * (max(bracket) - min(bracket)) + if min(max(bracket) - alpha, alpha - min(bracket)) < eps: + # interpolation close to boundary + if insuf_progress or alpha >= max(bracket) or alpha <= min(bracket): + # evaluate at 0.1 away from boundary + if paddle.abs(alpha - max(bracket)) < paddle.abs( + alpha - min(bracket) + ): + alpha = max(bracket) - eps + else: + alpha = min(bracket) + eps + insuf_progress = False + else: + insuf_progress = True + else: + insuf_progress = False + # Evaluate new point + loss_new, grad_new = obj_func(xk, alpha, d) + ls_func_evals += 1 + gtd_new = grad_new.dot(d) + ls_iter += 1 + + if ( + loss_new > (loss + c1 * alpha * gtd) + or loss_new >= bracket_f[low_pos] + ): + # Armijo condition not satisfied or not lower than lowest point + bracket[high_pos] = alpha + bracket_f[high_pos] = loss_new + # bracket_g[high_pos] = grad_new.clone(memory_format=torch.contiguous_format) + bracket_g[high_pos] = grad_new.clone() + bracket_gtd[high_pos] = gtd_new + low_pos, high_pos = ( + (0, 1) if bracket_f[0] <= bracket_f[1] else (1, 0) + ) + else: + if paddle.abs(gtd_new) <= -c2 * gtd: + # Wolfe conditions satisfied + done = True + elif gtd_new * (bracket[high_pos] - bracket[low_pos]) >= 0: + # old high becomes new low + bracket[high_pos] = bracket[low_pos] + bracket_f[high_pos] = bracket_f[low_pos] + bracket_g[high_pos] = bracket_g[low_pos] + bracket_gtd[high_pos] = bracket_gtd[low_pos] + + # new point becomes new low + bracket[low_pos] = alpha + bracket_f[low_pos] = loss_new + bracket_g[low_pos] = grad_new.clone() + bracket_gtd[low_pos] = gtd_new + + # return stuff + alpha = bracket[low_pos] + loss_new = bracket_f[low_pos] + grad_new = bracket_g[low_pos] + return loss_new, grad_new, alpha, ls_func_evals + + +class LBFGS(Optimizer): + r""" + The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate update formula: + + .. math:: + x_{k+1} = x_{k} + H_k \nabla{f_k} + + If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then + it's a quasi-Newton. In practice, the approximated Hessians are obtained + by only using the gradients, over either whole or part of the search + history, the former is BFGS, the latter is L-BFGS. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). + + Args: + learning_rate (float, optional): learning rate .The default value is 1. + max_iter (int, optional): maximal number of iterations per optimization step. + The default value is 20. + max_eval (int, optional): maximal number of function evaluations per optimization + step. The default value is max_iter * 1.25. + tolerance_grad (float, optional): termination tolerance on first order optimality + The default value is 1e-5. + tolerance_change (float, optional): termination tolerance on function + value/parameter changes. The default value is 1e-9. + history_size (int, optional): update history size. The default value is 100. + line_search_fn (string, optional): either 'strong_wolfe' or None. The default value is strong_wolfe. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. The default value is None. 
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \ + some derived class of ``GradientClipBase`` . There are three cliping strategies \ + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \ + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + Return: + loss (Tensor): the final loss of closure. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + from paddle.incubate.optimizer import LBFGS + + paddle.disable_static() + np.random.seed(0) + np_w = np.random.rand(1).astype(np.float32) + np_x = np.random.rand(1).astype(np.float32) + + inputs = [np.random.rand(1).astype(np.float32) for i in range(10)] + # y = 2x + targets = [2 * x for x in inputs] + + class Net(paddle.nn.Layer): + def __init__(self): + super().__init__() + w = paddle.to_tensor(np_w) + self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w)) + + def forward(self, x): + return self.w * x + + net = Net() + opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters()) + def train_step(inputs, targets): + def closure(): + outputs = net(inputs) + loss = paddle.nn.functional.mse_loss(outputs, targets) + print('loss: ', loss.item()) + opt.clear_grad() + loss.backward() + return loss + opt.step(closure) + + + for input, target in zip(inputs, targets): + input = paddle.to_tensor(input) + target = paddle.to_tensor(target) + train_step(input, target) + + """ + + def __init__( + self, + learning_rate=1.0, + max_iter=20, + max_eval=None, + tolerance_grad=1e-7, + tolerance_change=1e-9, + history_size=100, + line_search_fn=None, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): + if max_eval is None: + max_eval = max_iter * 5 // 4 + + self.learning_rate = learning_rate + self.max_iter = max_iter + self.max_eval = max_eval + self.tolerance_grad = tolerance_grad + self.tolerance_change = tolerance_change + self.history_size = history_size + self.line_search_fn = line_search_fn + + if isinstance(parameters, paddle.Tensor): + raise TypeError( + "parameters argument given to the optimizer should be " + "an iterable of Tensors or dicts, but got " + type(parameters) + ) + + self.state = defaultdict(dict) + + super().__init__( + learning_rate=1.0, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) + + if not isinstance(self._parameter_list[0], dict): + self._params = self._parameter_list + else: + for idx, param_group in enumerate(self._param_groups): + self._params = param_group['params'] + + self._numel_cache = None + + def state_dict(self): + r"""Returns the state 
of the optimizer as a :class:`dict`. + + Return: + state, a dict holding current optimization state. Its content + differs between optimizer classes. + """ + + packed_state = {} + for k, v in self.state.items(): + packed_state.update({k: v}) + + return {'state': packed_state} + + def _numel(self): + # compute the number of all parameters + if self._numel_cache is None: + self._numel_cache = reduce( + lambda total, p: total + p.numel(), self._params, 0 + ) + return self._numel_cache + + # flatten grad of all parameters + def _gather_flat_grad(self): + views = [] + for p in self._params: + if p.grad is None: + view = paddle.zeros_like(p).reshape([-1]) + else: + view = p.grad.reshape([-1]) + views.append(view) + return paddle.concat(views, axis=0) + + # compute xk = xk + alpha * direction + def _add_grad(self, alpha, direction): + offset = 0 + for p in self._params: + numel = reduce(lambda x, y: x * y, p.shape) + p = paddle.assign( + p.add( + direction[offset : offset + numel].reshape(p.shape) * alpha + ), + p, + ) + offset += numel + assert offset == self._numel() + + def _clone_param(self): + return [p.clone() for p in self._params] + + def _set_param(self, params_data): + for p, pdata in zip(self._params, params_data): + paddle.assign(pdata, p) + + def _directional_evaluate(self, closure, x, alpha, d): + self._add_grad(alpha, d) + loss = float(closure()) + flat_grad = self._gather_flat_grad() + self._set_param(x) + return loss, flat_grad + + @framework.non_static_only + def step(self, closure): + """Performs a single optimization step. + Args: + closure (callable): A closure that reevaluates the model + and returns the loss. + """ + + with paddle.no_grad(): + # Make sure the closure is always called with grad enabled + closure = paddle.enable_grad()(closure) + + learning_rate = self.learning_rate + max_iter = self.max_iter + max_eval = self.max_eval + tolerance_grad = self.tolerance_grad + tolerance_change = self.tolerance_change + line_search_fn = self.line_search_fn + history_size = self.history_size + state = self.state + state.setdefault('func_evals', 0) + state.setdefault('n_iter', 0) + + # evaluate initial f(x) and df/dx + orig_loss = closure() + loss = float(orig_loss) + + current_evals = 1 + state['func_evals'] += 1 + + flat_grad = self._gather_flat_grad() + opt_cond = flat_grad.abs().max() <= tolerance_grad + + # optimal condition + if opt_cond: + return orig_loss + + # tensors cached in state (for tracing) + d = state.get('d') + alpha = state.get('alpha') + old_yk = state.get('old_yk') + old_sk = state.get('old_sk') + ro = state.get('ro') + H_diag = state.get('H_diag') + prev_flat_grad = state.get('prev_flat_grad') + prev_loss = state.get('prev_loss') + + n_iter = 0 + # optimize for a max of max_iter iterations + while n_iter < max_iter: + # keep track of nb of iterations + n_iter += 1 + state['n_iter'] += 1 + + ############################################################ + # compute gradient descent direction + ############################################################ + if state['n_iter'] == 1: + d = flat_grad.neg() + old_yk = [] + old_sk = [] + ro = [] + H_diag = paddle.to_tensor(1.0, dtype=orig_loss.dtype) + else: + # do lbfgs update (update memory) + y = flat_grad.subtract(prev_flat_grad) + s = d.multiply(paddle.to_tensor(alpha, dtype=d.dtype)) + ys = y.dot(s) + if ys > 1e-10: + # updating memory + if len(old_yk) == history_size: + # shift history by one (limited-memory) + old_yk.pop(0) + old_sk.pop(0) + ro.pop(0) + + # store new direction/step + old_yk.append(y) + 
old_sk.append(s) + ro.append(1.0 / ys) + + # update scale of initial Hessian approximation + H_diag = ys / y.dot(y) # (y*y) + + # compute the approximate (L-BFGS) inverse Hessian + # multiplied by the gradient + num_old = len(old_yk) + + if 'al' not in state: + state['al'] = [None] * history_size + al = state['al'] + + # iteration in L-BFGS loop collapsed to use just one buffer + q = flat_grad.neg() + for i in range(num_old - 1, -1, -1): + al[i] = old_sk[i].dot(q) * ro[i] + paddle.assign(q.add(old_yk[i] * (-al[i])), q) + + # multiply by initial Hessian + # r/d is the final direction + d = r = paddle.multiply(q, H_diag) + for i in range(num_old): + be_i = old_yk[i].dot(r) * ro[i] + paddle.assign(r.add(old_sk[i] * (al[i] - be_i)), r) + + if prev_flat_grad is None: + prev_flat_grad = flat_grad.clone() + else: + paddle.assign(flat_grad, prev_flat_grad) + prev_loss = loss + + ############################################################ + # compute step length + ############################################################ + # reset initial guess for step size + if state['n_iter'] == 1: + alpha = ( + min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate + ) + else: + alpha = learning_rate + + # directional derivative + gtd = flat_grad.dot(d) + + # directional derivative is below tolerance + if gtd > -tolerance_change: + break + + # optional line search: user function + ls_func_evals = 0 + if line_search_fn is not None: + # perform line search, using user function + if line_search_fn != "strong_wolfe": + raise RuntimeError("only 'strong_wolfe' is supported") + else: + x_init = self._clone_param() + + def obj_func(x, alpha, d): + return self._directional_evaluate( + closure, x, alpha, d + ) + + loss, flat_grad, alpha, ls_func_evals = _strong_wolfe( + obj_func, x_init, alpha, d, loss, flat_grad, gtd + ) + self._add_grad(alpha, d) + opt_cond = flat_grad.abs().max() <= tolerance_grad + else: + # no line search, simply move with fixed-step + self._add_grad(alpha, d) + if n_iter != max_iter: + with paddle.enable_grad(): + loss = float(closure()) + flat_grad = self._gather_flat_grad() + opt_cond = flat_grad.abs().max() <= tolerance_grad + ls_func_evals = 1 + + # update func eval + current_evals += ls_func_evals + state['func_evals'] += ls_func_evals + + # optimal condition + if opt_cond: + break + + # lack of progress + if (d * alpha).abs().max() <= tolerance_change: + break + + if abs(loss - prev_loss) < tolerance_change: + break + + # check conditions + if current_evals >= max_eval: + break + + if n_iter == max_iter: + break + + state['d'] = d + state['alpha'] = alpha + state['old_yk'] = old_yk + state['old_sk'] = old_sk + state['ro'] = ro + state['H_diag'] = H_diag + state['prev_flat_grad'] = prev_flat_grad + state['prev_loss'] = prev_loss + + return orig_loss From 1a790edf0804a0e22ad5bd99f01c00f2e3bcf74a Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:43:09 +0800 Subject: [PATCH 082/405] [Zero-Dim] Support output 0D for gather_nd, einsum. 
(#53175) * add test cases, test=allcase * fix test cases, test=allcase * fix test cases, test=allcase * assert_allclose, test=allcase * 1e-5 to 1e-4, test=allcase * change rtol from 1e-4 to 1e-3, test=allcase --- .../tests/unittests/test_zero_dim_tensor.py | 106 ++++++++++++++++++ python/paddle/tensor/einsum.py | 4 +- 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 9c049ddbf435d..0e2449e7799c2 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -17,6 +17,7 @@ # 0D Tensor's shape is always [], numel is 1 # which can be created by paddle.rand([]) +import os import unittest import numpy as np @@ -1719,6 +1720,75 @@ def test_gather_xD_axis_1(self): self.assertEqual(x.grad.shape, [2, 3]) self.assertEqual(out.grad.shape, [2]) + def test_gather_nd(self): + x1 = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + x2 = paddle.to_tensor( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False + ) + + index1 = paddle.full([1], 1, 'int64') + index2 = paddle.full([2], 1, 'int64') + + out1 = paddle.gather_nd(x1, index1) + out2 = paddle.gather_nd(x2, index2) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + np.testing.assert_array_equal(out1, np.array(3.0)) + np.testing.assert_array_equal(out2, np.array(5.0)) + self.assertEqual(x1.grad.shape, [5]) + self.assertEqual(x2.grad.shape, [2, 3]) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(out2.grad.shape, []) + + def test_einsum(self): + os.environ['FLAGS_new_einsum'] = "0" + x = paddle.rand([5]) + # sum + out1 = paddle.einsum('i->', x) + expect1 = np.einsum('i->', x) + # dot + out2 = paddle.einsum('i,i->', x, x) + expect2 = np.einsum('i,i->', x, x) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + np.testing.assert_allclose(out1, expect1, rtol=1e-03) + np.testing.assert_allclose(out2, expect2, rtol=1e-03) + + def test_einsum_V2(self): + os.environ['FLAGS_new_einsum'] = "1" + x = paddle.rand([5]) + # sum + out1 = paddle.einsum('i->', x) + expect1 = np.einsum('i->', x) + # dot + out2 = paddle.einsum('i,i->', x, x) + expect2 = np.einsum('i,i->', x, x) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + np.testing.assert_allclose(out1, expect1, rtol=1e-03) + np.testing.assert_allclose(out2, expect2, rtol=1e-03) + def test_scatter_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) index = paddle.full([], 2, 'int64') @@ -3520,6 +3590,42 @@ def test_gather_XD_axis_1(self): self.assertEqual(res[1].shape, (2, 3)) self.assertEqual(res[2].shape, (2,)) + @prog_scope() + def test_gather_nd(self): + x1 = paddle.full([10], 1.0, 'float32') + x1.stop_gradient = False + x2 = paddle.full([2, 3], 1.0, 'float32') + x2.stop_gradient = False + + index1 = paddle.full([1], 1, 'int64') + index2 = paddle.full([2], 1, 'int64') + + out1 = paddle.gather_nd(x1, index1) + out2 = paddle.gather_nd(x2, index2) + paddle.static.append_backward(out1.sum()) + paddle.static.append_backward(out2.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + 
out1, + out2, + x1.grad_name, + x2.grad_name, + out1.grad_name, + out2.grad_name, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + np.testing.assert_array_equal(res[0], 1.0) + np.testing.assert_array_equal(res[1], 1.0) + self.assertEqual(res[2].shape, (10,)) + self.assertEqual(res[3].shape, (2, 3)) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[5].shape, ()) + @prog_scope() def test_scatter_1D(self): x = paddle.full([10], 1.0, 'float32') diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 7ab104dde94b0..082300763740a 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -966,8 +966,8 @@ def einsum(equation, *operands): # dot print(paddle.einsum('i,i->', x, x)) - # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [1.45936954]) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 1.45936954) # outer print(paddle.einsum("i,j->ij", x, y)) From e72cad592400d00009bc208fbbec9a532698b7cd Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 26 Apr 2023 17:02:44 +0800 Subject: [PATCH 083/405] modify approve rules (#53270) * delete prim flag for matmul_2_grad * delete prim flag for matmul_2_grad * recover multiply prune * add rules * add rules * add rules * modify approve rules * fix conflict * recover block --- tools/check_file_diff_approvals.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b67a343b8ea8f..55efe2d47c395 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -82,9 +82,10 @@ API_FILES=("CMakeLists.txt" "paddle/phi/core/infermeta_utils.h" "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" "paddle/fluid/prim/api/manual_prim/prim_manual_api.h" + "paddle/fluid/prim/api/api.yaml" "python/paddle/incubate/autograd/composite_rules.py" "python/paddle/incubate/autograd/primitives.py" - "paddle/fluid/prim/api/api.yaml" + ) approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` From 1164626c7e9cae1f9ca468fe78493d59649c5689 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 26 Apr 2023 17:03:25 +0800 Subject: [PATCH 084/405] Fix fused_attention_op and fused_feedforward_op bugs in xpu (#53318) * Fix fused_attention_op and fused_feedforward_op bugs in xpu * Fix d_x alloc errors for fused_feedforward_grad_kernel --- ...rd_grad_xpu_kernel.cc => fused_feedforward_grad_kernel.cc} | 4 ++-- ..._feedforward_xpu_kernel.cc => fused_feedforward_kernel.cc} | 2 +- paddle/phi/kernels/xpu/fused_attention_kernel.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) rename paddle/phi/kernels/fusion/xpu/{fused_feedforward_grad_xpu_kernel.cc => fused_feedforward_grad_kernel.cc} (99%) rename paddle/phi/kernels/fusion/xpu/{fused_feedforward_xpu_kernel.cc => fused_feedforward_kernel.cc} (99%) diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc similarity index 99% rename from paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc rename to paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc index cb10930dc9b3e..6798df360de19 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc +++ 
b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc @@ -478,7 +478,7 @@ void FusedFeedForwardGradKernel( dropout2_fix_seed, nullptr, dropout2_seed_val); - + dev_ctx.template Alloc(d_x); dev_ctx.template Alloc(d_ln_scale); dev_ctx.template Alloc(d_ln_bias); dev_ctx.template Alloc(d_linear1_bias); @@ -529,7 +529,7 @@ void FusedFeedForwardGradKernel( } // namespace fusion } // namespace phi -PD_REGISTER_KERNEL(fused_feedward_grad, +PD_REGISTER_KERNEL(fused_feedforward_grad, XPU, ALL_LAYOUT, phi::fusion::FusedFeedForwardGradKernel, diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc similarity index 99% rename from paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc rename to paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc index 35039ba571e57..221305014190b 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc @@ -377,7 +377,7 @@ void FusedFeedForwardKernel(const Context& dev_ctx, } // namespace fusion } // namespace phi -PD_REGISTER_KERNEL(fused_feedward, +PD_REGISTER_KERNEL(fused_feedforward, XPU, ALL_LAYOUT, phi::fusion::FusedFeedForwardKernel, diff --git a/paddle/phi/kernels/xpu/fused_attention_kernel.cc b/paddle/phi/kernels/xpu/fused_attention_kernel.cc index e91c109b37502..d18dda47866ef 100644 --- a/paddle/phi/kernels/xpu/fused_attention_kernel.cc +++ b/paddle/phi/kernels/xpu/fused_attention_kernel.cc @@ -181,12 +181,12 @@ void FusedAttentionKernel(const Context &dev_ctx, float *ln_mean_ptr = (ln_mean == nullptr) ? (nullptr) - : reinterpret_cast(dev_ctx.template Alloc(ln_mean)); + : reinterpret_cast(dev_ctx.template Alloc(ln_mean)); float *ln_var_ptr = (ln_var == nullptr) ? 
(nullptr) - : reinterpret_cast(dev_ctx.template Alloc(ln_var)); + : reinterpret_cast(dev_ctx.template Alloc(ln_var)); XPUTypeT *ln_out_ptr = (ln_out == nullptr) From 55c4eb8a6c58a3d72d92b6b7c797037681e2a417 Mon Sep 17 00:00:00 2001 From: mhy-666 <57670156+mhy-666@users.noreply.github.com> Date: Wed, 26 Apr 2023 17:21:59 +0800 Subject: [PATCH 085/405] =?UTF-8?q?=E3=80=90prim=E3=80=91scatter=5Fnd=5Fad?= =?UTF-8?q?d=5Fgrad=20(#52469)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add scatter_nd_add comp * add scatter_nd_add prim * fix * fix * add public_python_api in TestScatterNdAddSimpleOp setup function * fix composite_backward_api.h * fix composite_backward * add test cases * fix composite_backward_api.h, unittest --- .../composite_backward_api.h | 16 ++ paddle/phi/api/yaml/backward.yaml | 1 + .../tests/unittests/test_scatter_nd_op.py | 247 ++++++++++-------- 3 files changed, 153 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 5e1e490c1b73a..099ebc81b900b 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1805,5 +1805,21 @@ void roll_grad(const Tensor& x, set_output(x_grad_output, x_grad); } } + +template +void scatter_nd_add_grad(const Tensor& index, + const Tensor& updates, + const Tensor& out_grad, + Tensor* x_grad, + Tensor* updates_grad) { + if (x_grad) { + by_pass(out_grad, x_grad); + } + if (updates_grad) { + // Gradient by Gather: dUpdates = dO[Ids] + auto tmp_updates_grad = gather_nd(out_grad, index); + set_output(tmp_updates_grad, updates_grad); + } +} } // namespace prim } // namespace paddle diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 2394182ee4bd1..6faf2d0ba7a49 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1454,6 +1454,7 @@ kernel : func : scatter_nd_add_grad no_need_buffer : updates + composite: scatter_nd_add_grad(index, updates, out_grad, x_grad, updates_grad) - backward_op : segment_pool_grad forward : segment_pool (Tensor x, Tensor segment_ids, str pooltype="SUM") -> Tensor(out), Tensor(summed_ids) diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py index 0d09e0af5c32a..66799466c59e4 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py @@ -69,6 +69,8 @@ class TestScatterNdAddSimpleOp(OpTest): def setUp(self): self.op_type = "scatter_nd_add" self.python_api = paddle.scatter_nd_add + self.public_python_api = paddle.scatter_nd_add + self.prim_op_type = "prim" self._set_dtype() if self.dtype == np.float64: target_dtype = "float64" @@ -94,7 +96,7 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_prim=True) class TestScatterNdAddSimpleFP16Op(TestScatterNdAddSimpleOp): @@ -127,7 +129,9 @@ def test_check_output(self): def test_check_grad(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Updates'], 'Out') + self.check_grad_with_place( + place, ['X', 'Updates'], 'Out', check_prim=True + ) class TestScatterNdAddWithEmptyIndex(OpTest): @@ -138,6 +142,8 @@ class 
TestScatterNdAddWithEmptyIndex(OpTest): def setUp(self): self.op_type = "scatter_nd_add" self.python_api = paddle.scatter_nd_add + self.public_python_api = paddle.scatter_nd_add + self.prim_op_type = "prim" self._set_dtype() if self.dtype == np.float64: target_dtype = "float64" @@ -166,7 +172,7 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_prim=True) class TestScatterNdAddWithEmptyIndexFP16(TestScatterNdAddWithEmptyIndex): @@ -199,7 +205,9 @@ def test_check_output(self): def test_check_grad(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Updates'], 'Out') + self.check_grad_with_place( + place, ['X', 'Updates'], 'Out', check_prim=True + ) class TestScatterNdAddWithHighRankSame(OpTest): @@ -210,6 +218,8 @@ class TestScatterNdAddWithHighRankSame(OpTest): def setUp(self): self.op_type = "scatter_nd_add" self.python_api = paddle.scatter_nd_add + self.public_python_api = paddle.scatter_nd_add + self.prim_op_type = "prim" self._set_dtype() if self.dtype == np.float64: target_dtype = "float64" @@ -241,7 +251,7 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_prim=True) class TestScatterNdAddWithHighRankSameFP16(TestScatterNdAddWithHighRankSame): @@ -274,7 +284,9 @@ def test_check_output(self): def test_check_grad(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Updates'], 'Out') + self.check_grad_with_place( + place, ['X', 'Updates'], 'Out', check_prim=True + ) class TestScatterNdAddWithHighRankDiff(OpTest): @@ -285,6 +297,8 @@ class TestScatterNdAddWithHighRankDiff(OpTest): def setUp(self): self.op_type = "scatter_nd_add" self.python_api = paddle.scatter_nd_add + self.public_python_api = paddle.scatter_nd_add + self.prim_op_type = "prim" shape = (8, 2, 2, 1, 10) ref_np = np.random.rand(*shape).astype("double") index = np.vstack([np.random.randint(0, s, size=500) for s in shape]).T @@ -300,7 +314,7 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'Updates'], 'Out') + self.check_grad(['X', 'Updates'], 'Out', check_prim=True) # Test Python API @@ -310,70 +324,76 @@ class TestScatterNdOpAPI(unittest.TestCase): """ def testcase1(self): - ref1 = paddle.static.data( - name='ref1', - shape=[10, 9, 8, 1, 3], - dtype='float32', - ) - index1 = paddle.static.data( - name='index1', - shape=[5, 5, 8, 5], - dtype='int32', - ) - updates1 = paddle.static.data( - name='update1', - shape=[5, 5, 8], - dtype='float32', - ) - output1 = paddle.scatter_nd_add(ref1, index1, updates1) + with paddle.fluid.framework._static_guard(): + ref1 = paddle.static.data( + name='ref1', + shape=[10, 9, 8, 1, 3], + dtype='float32', + ) + index1 = paddle.static.data( + name='index1', + shape=[5, 5, 8, 5], + dtype='int32', + ) + updates1 = paddle.static.data( + name='update1', + shape=[5, 5, 8], + dtype='float32', + ) + output1 = paddle.scatter_nd_add(ref1, index1, updates1) def testcase2(self): - ref2 = paddle.static.data( - name='ref2', - shape=[10, 9, 8, 1, 3], - dtype='double', - ) - index2 = paddle.static.data( - name='index2', - shape=[5, 8, 5], - dtype='int32', - ) - updates2 = paddle.static.data( - name='update2', - shape=[5, 8], - dtype='double', - ) - output2 = paddle.scatter_nd_add( - ref2, index2, updates2, 
name="scatter_nd_add" - ) + with paddle.fluid.framework._static_guard(): + ref2 = paddle.static.data( + name='ref2', + shape=[10, 9, 8, 1, 3], + dtype='double', + ) + index2 = paddle.static.data( + name='index2', + shape=[5, 8, 5], + dtype='int32', + ) + updates2 = paddle.static.data( + name='update2', + shape=[5, 8], + dtype='double', + ) + output2 = paddle.scatter_nd_add( + ref2, index2, updates2, name="scatter_nd_add" + ) def testcase3(self): - shape3 = [10, 9, 8, 1, 3] - index3 = paddle.static.data( - name='index3', - shape=[5, 5, 8, 5], - dtype='int32', - ) - updates3 = paddle.static.data( - name='update3', - shape=[5, 5, 8], - dtype='float32', - ) - output3 = paddle.scatter_nd(index3, updates3, shape3) + with paddle.fluid.framework._static_guard(): + shape3 = [10, 9, 8, 1, 3] + index3 = paddle.static.data( + name='index3', + shape=[5, 5, 8, 5], + dtype='int32', + ) + updates3 = paddle.static.data( + name='update3', + shape=[5, 5, 8], + dtype='float32', + ) + output3 = paddle.scatter_nd(index3, updates3, shape3) def testcase4(self): - shape4 = [10, 9, 8, 1, 3] - index4 = paddle.static.data( - name='index4', - shape=[5, 5, 8, 5], - dtype='int32', - ) - updates4 = paddle.static.data( - name='update4', - shape=[5, 5, 8], - dtype='double', - ) - output4 = paddle.scatter_nd(index4, updates4, shape4, name='scatter_nd') + with paddle.fluid.framework._static_guard(): + shape4 = [10, 9, 8, 1, 3] + index4 = paddle.static.data( + name='index4', + shape=[5, 5, 8, 5], + dtype='int32', + ) + updates4 = paddle.static.data( + name='update4', + shape=[5, 5, 8], + dtype='double', + ) + output4 = paddle.scatter_nd( + index4, updates4, shape4, name='scatter_nd' + ) def testcase5(self): if not fluid.core.is_compiled_with_cuda(): @@ -430,60 +450,65 @@ def test_static_graph(): class TestScatterNdOpRaise(unittest.TestCase): def test_check_raise(self): def check_raise_is_test(): - try: - ref5 = paddle.static.data( - name='ref5', shape=[-1, 3, 4, 5], dtype='float32' - ) - index5 = paddle.static.data( - name='index5', shape=[-1, 2, 10], dtype='int32' - ) - updates5 = paddle.static.data( - name='updates5', shape=[-1, 2, 10], dtype='float32' - ) - output5 = paddle.scatter_nd_add(ref5, index5, updates5) - except Exception as e: - t = "The last dimension of Input(Index)'s shape should be no greater " - if t in str(e): - raise IndexError + with paddle.fluid.framework._static_guard(): + try: + ref5 = paddle.static.data( + name='ref5', shape=[-1, 3, 4, 5], dtype='float32' + ) + index5 = paddle.static.data( + name='index5', shape=[-1, 2, 10], dtype='int32' + ) + updates5 = paddle.static.data( + name='updates5', shape=[-1, 2, 10], dtype='float32' + ) + output5 = paddle.scatter_nd_add(ref5, index5, updates5) + except Exception as e: + t = "The last dimension of Input(Index)'s shape should be no greater " + if t in str(e): + raise IndexError self.assertRaises(IndexError, check_raise_is_test) def test_check_raise2(self): with self.assertRaises(ValueError): - ref6 = paddle.static.data( - name='ref6', - shape=[10, 9, 8, 1, 3], - dtype='double', - ) - index6 = paddle.static.data( - name='index6', - shape=[5, 8, 5], - dtype='int32', - ) - updates6 = paddle.static.data( - name='update6', - shape=[5, 8], - dtype='float32', - ) - output6 = paddle.scatter_nd_add(ref6, index6, updates6) - - def test_check_raise3(self): - def check_raise_is_test(): - try: - shape = [3, 4, 5] - index7 = paddle.static.data( - name='index7', shape=[-1, 2, 1], dtype='int32' + with paddle.fluid.framework._static_guard(): + ref6 = paddle.static.data( + 
name='ref6', + shape=[10, 9, 8, 1, 3], + dtype='double', + ) + index6 = paddle.static.data( + name='index6', + shape=[5, 8, 5], + dtype='int32', ) - updates7 = paddle.static.data( - name='updates7', shape=[-1, 2, 4, 5, 20], dtype='float32' + updates6 = paddle.static.data( + name='update6', + shape=[5, 8], + dtype='float32', ) - output7 = paddle.scatter_nd(index7, updates7, shape) - except Exception as e: - t = "Updates has wrong shape" - if t in str(e): - raise ValueError + output6 = paddle.scatter_nd_add(ref6, index6, updates6) - self.assertRaises(ValueError, check_raise_is_test) + def test_check_raise3(self): + def check_raise_is_test(): + with paddle.fluid.framework._static_guard(): + try: + shape = [3, 4, 5] + index7 = paddle.static.data( + name='index7', shape=[-1, 2, 1], dtype='int32' + ) + updates7 = paddle.static.data( + name='updates7', + shape=[-1, 2, 4, 5, 20], + dtype='float32', + ) + output7 = paddle.scatter_nd(index7, updates7, shape) + except Exception as e: + t = "Updates has wrong shape" + if t in str(e): + raise ValueError + + self.assertRaises(ValueError, check_raise_is_test) class TestDygraph(unittest.TestCase): From 0e30d56a5330a13e5d5590d5c52e7ff05b487772 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 26 Apr 2023 19:42:05 +0800 Subject: [PATCH 086/405] Add CMake option WITH_CPP_DIST for installing C++ distribution (#53282) * rm paddle_install_dir * update test=document_fix * update test=document_fix * update * add test * update * update * update * update * update * update * update * update * update * add todo commont --- CMakeLists.txt | 16 +++ cmake/PaddleConfig.cmake.in | 33 +++++ cmake/inference_lib.cmake | 161 +------------------------ cmake/paddle_lib.cmake | 24 ++++ cmake/version.cmake | 43 +++++++ python/env_dict.py.in | 5 +- python/setup.py.in | 82 ++++++++++--- setup.py | 57 +++++++++ test/CMakeLists.txt | 9 ++ test/paddle_lib/CMakeLists.txt.in | 14 +++ test/paddle_lib/test_paddle_lib.cc | 27 +++++ test/paddle_lib/test_paddle_lib_gpu.cc | 25 ++++ 12 files changed, 321 insertions(+), 175 deletions(-) create mode 100644 cmake/PaddleConfig.cmake.in create mode 100644 cmake/paddle_lib.cmake create mode 100644 test/paddle_lib/CMakeLists.txt.in create mode 100644 test/paddle_lib/test_paddle_lib.cc create mode 100644 test/paddle_lib/test_paddle_lib_gpu.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index db92b63b41c7a..34110edff3038 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" ON) +option(WITH_CPP_DIST "Install PaddlePaddle C++ distribution" OFF) ################################ Internal Configurations ####################################### option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" @@ -662,6 +663,21 @@ if(WITH_STRIP) endif() endif() +if(WITH_CPP_DIST) + # TODO(huangjiyi): Separate installing C++ distribution from python package + # installation and support for installing C++ distribution on more platforms. + if(NOT LINUX OR NOT WITH_PYTHON) + set(WITH_CPP_DIST + OFF + CACHE + STRING + "Currently C++ Distribution Generation is only available on Linux and compiling WITH_PYTHON=ON." 
+ FORCE) + else() + include(paddle_lib) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/cmake/PaddleConfig.cmake.in b/cmake/PaddleConfig.cmake.in new file mode 100644 index 0000000000000..d32c23f6f6edd --- /dev/null +++ b/cmake/PaddleConfig.cmake.in @@ -0,0 +1,33 @@ +# Paddle CMake configuration file +# ------- +# +# Finds the Paddle library +# +# This will define the following variables: +# +# PADDLE_FOUND -- True if the system has the Paddle library +# PADDLE_INCLUDE_DIRS -- The include directories for Paddle +# PADDLE_LIBRARIES -- Libraries to link against + +get_filename_component(PADDLE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_FILE}/../.." ABSOLUTE) + +# include directories +set(PADDLE_INCLUDE_DIRS + ${PADDLE_INSTALL_PREFIX}/include + ${PADDLE_INSTALL_PREFIX}/include/third_party +) + +# Library dependencies. +set(PADDLE_LIBRARIES_DIRS ${PADDLE_INSTALL_PREFIX}/lib) +link_directories(${PADDLE_LIBRARIES_DIRS}) + +file(GLOB PADDLE_LIBRARIES ${PADDLE_LIBRARIES_DIRS}/lib*) + +find_package(PythonLibs @PY_VERSION@ REQUIRED) +list(APPEND PADDLE_INCLUDE_DIRS ${PYTHON_INCLUDE_DIRS}) +list(APPEND PADDLE_LIBRARIES ${PYTHON_LIBRARIES}) + +if(@WITH_GPU@) + find_package(CUDA @CUDA_VERSION@ REQUIRED) + list(APPEND PADDLE_LIBRARIES ${CUDA_LIBRARIES}) +endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index f5fc9b8b9cf8f..cb1fdf72b8e2a 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# make package for paddle fluid shared and static library -set(PADDLE_INSTALL_DIR - "${CMAKE_BINARY_DIR}/paddle_install_dir" - CACHE STRING "A path setting paddle shared and static libraries") - +# make package for paddle inference shared and static library set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING "A path setting paddle inference shared and static libraries") @@ -214,7 +210,7 @@ endfunction() # inference library for only inference set(inference_lib_deps third_party paddle_inference paddle_inference_c paddle_inference_shared paddle_inference_c_shared) -add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps}) +add_custom_target(inference_lib_dist ALL DEPENDS ${inference_lib_deps}) set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool") copy( @@ -387,158 +383,5 @@ if(WITH_STRIP AND NOT WIN32) ) endif() -# fluid library for both train and inference -set(fluid_lib_deps inference_lib_dist) -add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps}) - -set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid") -set(module "inference") -if(WIN32) - copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h - ${paddle_inference_lib} - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} - ${dst_dir}/${module}) -else() - copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h - ${paddle_inference_lib} - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}) -endif() - -set(module "framework") -set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto) -add_dependencies(fluid_lib_dist ${framework_lib_deps}) -copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/*.h - ${src_dir}/${module}/details/*.h - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - 
${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h - ${src_dir}/${module}/ir/memory_optimize_pass/*.h - ${src_dir}/${module}/ir/*.h - ${src_dir}/${module}/fleet/*.h - DSTS ${dst_dir}/${module} - ${dst_dir}/${module}/details - ${dst_dir}/${module} - ${dst_dir}/${module} - ${dst_dir}/${module} - ${dst_dir}/${module}/ir/memory_optimize_pass - ${dst_dir}/${module}/ir - ${dst_dir}/${module}/fleet) - -set(module "operators") -copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/reader/blocking_queue.h - DSTS ${dst_dir}/${module}/reader/) - -set(module "memory") -copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/allocation/*.h - DSTS ${dst_dir}/${module}/allocation) - -set(module "platform") -set(platform_lib_deps phi_profiler_proto errors) -if(WITH_GPU) - set(platform_lib_deps ${platform_lib_deps} external_error_proto) -endif() - -add_dependencies(fluid_lib_dist ${platform_lib_deps}) -copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h - ${PADDLE_BINARY_DIR}/paddle/phi/api/profiler/*.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}) - -set(module "string") -copy( - fluid_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h - ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat) - -set(module "imperative") -copy( - fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/jit/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/jit) - -set(module "pybind") -copy( - fluid_lib_dist - SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h - DSTS ${dst_dir}/${module}) - -set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3") -copy( - inference_lib_dist - SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src - ${EIGEN_INCLUDE_DIR}/unsupported/Eigen - DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) - -set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack") -copy( - inference_lib_dist - SRCS ${DLPACK_INCLUDE_DIR}/dlpack - DSTS ${dst_dir}) - -set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib") -copy( - inference_lib_dist - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) - -# CMakeCache Info -copy( - fluid_lib_dist - SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party - ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR}) - -# paddle fluid version -function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file( - WRITE ${version_file} - "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n" - "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n" - "WITH_ROCM: ${WITH_ROCM}\n" "WITH_IPU: ${WITH_IPU}\n") - if(WITH_GPU) - file(APPEND ${version_file} - "CUDA version: ${CUDA_VERSION}\n" - "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") - endif() - if(WITH_ROCM) - file(APPEND ${version_file} - "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" - "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") - endif() - if(WITH_IPU) - file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n") - endif() - file(APPEND ${version_file} - "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") - if(TENSORRT_FOUND) - file( - APPEND ${version_file} - "WITH_TENSORRT: ${TENSORRT_FOUND}\n" - "TensorRT version: 
v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n" - ) - endif() - if(WITH_LITE) - file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n" - "LITE_GIT_TAG: ${LITE_GIT_TAG}\n") - endif() - -endfunction() -version(${PADDLE_INSTALL_DIR}/version.txt) version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt) version(${PADDLE_INFERENCE_C_INSTALL_DIR}/version.txt) diff --git a/cmake/paddle_lib.cmake b/cmake/paddle_lib.cmake new file mode 100644 index 0000000000000..535878b5b3a7c --- /dev/null +++ b/cmake/paddle_lib.cmake @@ -0,0 +1,24 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# make package for paddle shared library +set(PADDLE_INSTALL_DIR ${PADDLE_BINARY_DIR}/paddle_install_dir) +set(PADDLE_LIB_TEST_DIR ${PADDLE_BINARY_DIR}/test/paddle_lib) + +configure_file(${PADDLE_SOURCE_DIR}/cmake/PaddleConfig.cmake.in + ${PADDLE_INSTALL_DIR}/cmake/PaddleConfig.cmake @ONLY) +configure_file(${PADDLE_SOURCE_DIR}/test/paddle_lib/CMakeLists.txt.in + ${PADDLE_BINARY_DIR}/test/paddle_lib/CMakeLists.txt @ONLY) + +version(${PADDLE_INSTALL_DIR}/version.txt) diff --git a/cmake/version.cmake b/cmake/version.cmake index 83bd3f1b1bc4a..e6707665a3851 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -71,3 +71,46 @@ math(EXPR PADDLE_VERSION_INTEGER "${PADDLE_MAJOR_VER} * 1000000 add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION}) add_definitions(-DPADDLE_VERSION_INTEGER=${PADDLE_VERSION_INTEGER}) message(STATUS "Paddle version is ${PADDLE_VERSION}") + +# write paddle version +function(version version_file) + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file( + WRITE ${version_file} + "Paddle version: ${PADDLE_VERSION}\n" + "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" + "WITH_MKL: ${WITH_MKL}\n" + "WITH_MKLDNN: ${WITH_MKLDNN}\n" + "WITH_GPU: ${WITH_GPU}\n" + "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_IPU: ${WITH_IPU}\n") + if(WITH_GPU) + file(APPEND ${version_file} + "CUDA version: ${CUDA_VERSION}\n" + "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") + endif() + if(WITH_ROCM) + file(APPEND ${version_file} + "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" + "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") + endif() + if(WITH_IPU) + file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n") + endif() + file(APPEND ${version_file} + "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") + if(TENSORRT_FOUND) + file( + APPEND ${version_file} + "WITH_TENSORRT: ${TENSORRT_FOUND}\n" + "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n" + ) + endif() + if(WITH_LITE) + file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n" + "LITE_GIT_TAG: ${LITE_GIT_TAG}\n") + endif() +endfunction() diff --git a/python/env_dict.py.in b/python/env_dict.py.in index 
00ca04dc56cde..d8ae5f9144b65 100644 --- a/python/env_dict.py.in +++ b/python/env_dict.py.in @@ -75,5 +75,8 @@ env_dict={ 'PYBIND_INCLUDE_DIR':'@PYBIND_INCLUDE_DIR@', 'WITH_PYTHON':'@WITH_PYTHON@', 'WITH_CINN':'@WITH_CINN@', - 'CINN_SOURCE_DIR':'@CINN_SOURCE_DIR@' + 'CINN_SOURCE_DIR':'@CINN_SOURCE_DIR@', + 'WITH_CPP_DIST':'@WITH_CPP_DIST@', + 'PADDLE_INSTALL_DIR':'@PADDLE_INSTALL_DIR@', + 'PADDLE_LIB_TEST_DIR':'@PADDLE_LIB_TEST_DIR@' } diff --git a/python/setup.py.in b/python/setup.py.in index c87a5923f5c6a..1c59c4aaa4746 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -782,6 +782,24 @@ if '${WITH_XPU}' == 'ON': headers += list(find_files('*.h', '${PYBIND_INCLUDE_DIR}', True)) # pybind headers +def get_header_install_dir(header): + if 'pb.h' in header: + install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header) + elif 'third_party' not in header: + # paddle headers + install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + print('install_dir: ', install_dir) + if 'fluid/jit' in install_dir: + install_dir = re.sub('fluid/jit', 'jit', install_dir) + print('fluid/jit install_dir: ', install_dir) + else: + # third_party + install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) + patterns = ['install/mkldnn/include', 'pybind/src/extern_pybind/include', 'third_party/xpu/src/extern_xpu/xpu/include/'] + for pattern in patterns: + install_dir = re.sub(pattern, '', install_dir) + return install_dir + class InstallCommand(InstallCommandBase): def finalize_options(self): ret = InstallCommandBase.finalize_options(self) @@ -813,21 +831,7 @@ class InstallHeaders(Command): ('force', 'force')) def mkdir_and_copy_file(self, header): - if 'pb.h' in header: - install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header) - elif 'third_party' not in header: - # paddle headers - install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) - print('install_dir: ', install_dir) - if 'fluid/jit' in install_dir: - install_dir = re.sub('fluid/jit', 'jit', install_dir) - print('fluid/jit install_dir: ', install_dir) - else: - # third_party - install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) - patterns = ['install/mkldnn/include', 'pybind/src/extern_pybind/include', 'third_party/xpu/src/extern_xpu/xpu/include/'] - for pattern in patterns: - install_dir = re.sub(pattern, '', install_dir) + install_dir = get_header_install_dir(header) install_dir = os.path.join(self.install_dir, os.path.dirname(install_dir)) if not os.path.exists(install_dir): self.mkpath(install_dir) @@ -885,6 +889,54 @@ if '${WITH_STRIP}' == 'ON': if os.system(command) != 0: raise Exception("strip *.so failed, command: %s" % command) +def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir): + """install cpp distribution and build test target + + TODO(huangjiyi): + 1. This function will be moved when seperating C++ distribution + installation from python package installation. + 2. Reduce the header and library files to be installed. 
+ """ + if '${CMAKE_BUILD_TYPE}' != 'Release': + return + os.makedirs(paddle_install_dir, exist_ok=True) + # install C++ header files + for header in headers: + install_dir = get_header_install_dir(header) + install_dir = os.path.join( + paddle_install_dir, 'include', os.path.dirname(install_dir) + ) + os.makedirs(install_dir, exist_ok=True) + shutil.copy(header, install_dir) + + # install C++ shared libraries + lib_install_dir = os.path.join(paddle_install_dir, 'lib') + os.makedirs(lib_install_dir, exist_ok=True) + # install libpaddle.ext + paddle_libs = glob.glob('${PADDLE_BINARY_DIR}/paddle/fluid/pybind/${FLUID_CORE_NAME}.*') + for lib in paddle_libs: + shutil.copy(lib, lib_install_dir) + # install dependent libraries + libs_path = package_dir['paddle.libs'] + for lib in package_data['paddle.libs']: + lib_path = os.path.join(libs_path, lib) + shutil.copy(lib_path, lib_install_dir) + + # build test target + cmake_args = ["cmake", paddle_lib_test_dir, "-B", paddle_lib_test_dir] + if os.getenv("GENERATOR") == "Ninja": + cmake_args.append("-GNinja") + subprocess.check_call(cmake_args) + subprocess.check_call(["cmake", "--build", paddle_lib_test_dir]) + + +# install cpp distribution +if '${WITH_CPP_DIST}' == 'ON': + paddle_install_dir = '${PADDLE_INSTALL_DIR}' + paddle_lib_test_dir = '${PADDLE_LIB_TEST_DIR}' + install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir) + + with redirect_stdout(): setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/setup.py b/setup.py index 9e617b5e340fc..6a305243bbe3d 100644 --- a/setup.py +++ b/setup.py @@ -1545,6 +1545,52 @@ def check_build_dependency(): raise RuntimeError(missing_modules.format(dependency=dependency)) +def install_cpp_dist_and_build_test(install_dir, lib_test_dir, headers, libs): + """install cpp distribution and build test target + + TODO(huangjiyi): + 1. This function will be moved when seperating C++ distribution + installation from python package installation. + 2. Reduce the header and library files to be installed. 
+ """ + if env_dict.get("CMAKE_BUILD_TYPE") != 'Release': + return + os.makedirs(install_dir, exist_ok=True) + # install C++ header files + for header in headers: + header_install_dir = get_header_install_dir(header) + header_install_dir = os.path.join( + install_dir, 'include', os.path.dirname(header_install_dir) + ) + os.makedirs(header_install_dir, exist_ok=True) + shutil.copy(header, header_install_dir) + + # install C++ shared libraries + lib_install_dir = os.path.join(install_dir, 'lib') + os.makedirs(lib_install_dir, exist_ok=True) + # install libpaddle.ext + paddle_libs = glob.glob( + paddle_binary_dir + + '/paddle/fluid/pybind/' + + env_dict.get("FLUID_CORE_NAME") + + '.*' + ) + for lib in paddle_libs: + shutil.copy(lib, lib_install_dir) + # install dependent libraries + libs_path = paddle_binary_dir + '/python/paddle/libs' + for lib in libs: + lib_path = os.path.join(libs_path, lib) + shutil.copy(lib_path, lib_install_dir) + + # build test target + cmake_args = [CMAKE, lib_test_dir, "-B", lib_test_dir] + if os.getenv("GENERATOR") == "Ninja": + cmake_args.append("-GNinja") + subprocess.check_call(cmake_args) + subprocess.check_call([CMAKE, "--build", lib_test_dir]) + + def main(): # Parse the command line and check arguments before we proceed with building steps and setup parse_input_command(filter_args_list) @@ -1617,6 +1663,17 @@ def main(): if os.system(command) != 0: raise Exception("strip *.so failed, command: %s" % command) + # install cpp distribution + if env_dict.get("WITH_CPP_DIST") == 'ON': + paddle_install_dir = env_dict.get("PADDLE_INSTALL_DIR") + paddle_lib_test_dir = env_dict.get("PADDLE_LIB_TEST_DIR") + install_cpp_dist_and_build_test( + paddle_install_dir, + paddle_lib_test_dir, + headers, + package_data['paddle.libs'], + ) + setup( name=package_name, version=paddle_version, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 066875945a79c..96bd6ca3c263d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -135,6 +135,15 @@ if(WITH_TESTING) endif() endif() +if(WITH_CPP_DIST) + add_test(NAME test_paddle_lib + COMMAND ${PADDLE_BINARY_DIR}/test/paddle_lib/test_paddle_lib) + if(WITH_GPU) + add_test(NAME test_paddle_lib_gpu + COMMAND ${PADDLE_BINARY_DIR}/test/paddle_lib/test_paddle_lib_gpu) + endif() +endif() + get_property(test_srcs GLOBAL PROPERTY TEST_SRCS) get_property(test_names GLOBAL PROPERTY TEST_NAMES) diff --git a/test/paddle_lib/CMakeLists.txt.in b/test/paddle_lib/CMakeLists.txt.in new file mode 100644 index 0000000000000..ec0c4c00f1905 --- /dev/null +++ b/test/paddle_lib/CMakeLists.txt.in @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.15) +project(test_paddle_lib) + +list(APPEND CMAKE_PREFIX_PATH "@PADDLE_BINARY_DIR@/paddle_install_dir") +find_package(Paddle REQUIRED) +include_directories(${PADDLE_INCLUDE_DIRS}) + +add_executable(test_paddle_lib test_paddle_lib.cc) +target_link_libraries(test_paddle_lib ${PADDLE_LIBRARIES}) + +if(@WITH_GPU@) + add_executable(test_paddle_lib_gpu test_paddle_lib_gpu.cc) + target_link_libraries(test_paddle_lib_gpu ${PADDLE_LIBRARIES}) +endif() diff --git a/test/paddle_lib/test_paddle_lib.cc b/test/paddle_lib/test_paddle_lib.cc new file mode 100644 index 0000000000000..9c32d7e2c5b60 --- /dev/null +++ b/test/paddle_lib/test_paddle_lib.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/extension.h" + +int main() { + int data[] = {1, 2, 3, 4}; + auto tensor = paddle::from_blob(data, {2, 2}, phi::DataType::INT32); + + assert(tensor.numel() == 4); + assert(tensor.dtype() == phi::DataType::INT32); + assert(tensor.is_cpu()); + assert(tensor.template data() == data); +} diff --git a/test/paddle_lib/test_paddle_lib_gpu.cc b/test/paddle_lib/test_paddle_lib_gpu.cc new file mode 100644 index 0000000000000..4cd27c53f186d --- /dev/null +++ b/test/paddle_lib/test_paddle_lib_gpu.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/extension.h" + +int main() { + float data[] = {1., 2., 3., 4.}; + auto tensor = paddle::from_blob(data, {2, 2}, phi::DataType::FLOAT32); + auto gpu_tensor = + paddle::experimental::copy_to(tensor, phi::GPUPlace(), false); + assert(gpu_tensor.is_gpu()); +} From 3650c4a85dd2a0e52885897cfa4a3303a7f45645 Mon Sep 17 00:00:00 2001 From: liuzhenhai93 Date: Wed, 26 Apr 2023 19:59:48 +0800 Subject: [PATCH 087/405] =?UTF-8?q?=20pp=20=E7=AD=96=E7=95=A5=E8=B0=83?= =?UTF-8?q?=E6=95=B4=E5=90=8E=EF=BC=8C=E6=A8=A1=E5=9E=8B=E8=BD=AC=E6=8D=A2?= =?UTF-8?q?=EF=BC=8C=E4=BB=A5=E4=BE=BF=E6=A8=A1=E5=9E=8B=E7=83=AD=E5=90=AF?= =?UTF-8?q?=20(#52927)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish --- .../parallel_layers/pp_layers.py | 4 +- .../fleet/utils/pp_parallel_adaptor.py | 612 ++++++++++++++++++ .../unittests/collective/fleet/CMakeLists.txt | 13 + .../fleet/hybrid_parallel_pp_transformer.py | 4 +- .../hybrid_parallel_pp_transformer_save.py | 101 +++ ..._pp_transformer_save_with_virtual_stage.py | 105 +++ ...allel_pp_transformer_with_virtual_stage.py | 5 +- .../fleet/test_parallel_dygraph_pp_adaptor.py | 163 +++++ .../unittests/collective/fleet/testslist.csv | 1 + python/paddle/framework/io.py | 9 +- test/auto_parallel/CMakeLists.txt | 2 +- test/distributed_passes/CMakeLists.txt | 2 +- test/ir/inference/CMakeLists.txt | 2 +- 13 files changed, 1012 insertions(+), 11 deletions(-) create mode 100644 python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py create mode 100644 python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save.py create mode 100644 
python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py create mode 100644 python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pp_adaptor.py diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index f3be9894a9cfe..bc403da76ba91 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -564,7 +564,7 @@ def _segment_network_for_interleave(self, seg_method): self.segment_parts = seg.do_segment() logger.info( - "segment result:" + f"segment with method: {seg_method}; result: " + ", ".join(str(arg) for arg in self.segment_parts) ) @@ -594,7 +594,7 @@ def _segment_network(self, seg_method): self.segment_parts = seg.do_segment() logger.info( - "segment result:" + f"segment with method: {seg_method}; result: " + ", ".join(str(arg) for arg in self.segment_parts) ) diff --git a/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py b/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py new file mode 100644 index 0000000000000..9fa1caf35a547 --- /dev/null +++ b/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py @@ -0,0 +1,612 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
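+# This utility re-partitions a pipeline-parallel checkpoint: it extracts the
+# per-layer model and optimizer states saved under one (mp, pp, vpp, sharding)
+# configuration and re-merges them for a new pipeline configuration, so that
+# training can be warm-started after the pipeline-parallel strategy is changed.
+#
+# Example invocation (the paths, degrees and layer count below are hypothetical
+# placeholders, not values taken from this patch):
+#   python pp_parallel_adaptor.py --method adapt_model \
+#       --src_path ./output/epoch_0_step_30 --dst_path ./converted \
+#       --src_mp 2 --src_pp 4 --src_vp 2 --dst_pp 2 --dst_vp 1 \
+#       --sharding 1 --segment_method layer --transformer_layer_num 24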
+ +import argparse +import math +import re +import shutil +from collections import OrderedDict + +import paddle + + +class ParallelConfig: + def __init__(self, mp: int, pp: int, vpp: int = 1, sharding: int = 1): + self.mp = mp + self.pp = pp + self.vpp = vpp + self.sharding = sharding + + def pipe_parallel_group(self, i: int, j: int): + ans = [] + for k in range(self.pp): + ans.append((i, j, k)) + return ans + + +class LayerReNamingHelper: + def __init__(self, template: str): + self._template = template + self._i = -1 + self._last_old_layer_name = None + + def get_new_layer_name(self, old_layer_name: str): + old_layer_name = old_layer_name.split(".")[0] + if ( + self._last_old_layer_name is None + or old_layer_name != self._last_old_layer_name + ): + self._i = self._i + 1 + self._last_old_layer_name = old_layer_name + return self._template.format(self._i) + + +class LayerReNamingManager: + def __init__(self): + self._renaming_helpers = OrderedDict() + self._renaming_helpers["linear"] = LayerReNamingHelper("linear_{}") + self._renaming_helpers["layer_norm"] = LayerReNamingHelper( + "layer_norm_{}" + ) + self._renaming_helpers["embedding"] = LayerReNamingHelper( + "embedding_{}" + ) + + def get_new_layer_name(self, old_name: str): + layer_name = "" + for (k, v) in self._renaming_helpers.items(): + if old_name.startswith(k): + layer_name = v.get_new_layer_name(old_name) + break + return layer_name + + def get_new_param_name(self, old_name: str): + names = old_name.split(".") + layer_name = self.get_new_layer_name(names[0]) + assert layer_name, f"can not rename layer {names[0]}" + names[0] = layer_name + return ".".join(names) + + +class PipeLineModelAdaptor: + def __init__( + self, + src_parallel_config: ParallelConfig, + dst_parallel_config: ParallelConfig, + transformer_layer_num: int, + segment_method: str = "layer", + ): + self._src_parallel_config = src_parallel_config + self._dst_parallel_config = dst_parallel_config + self._transformer_layer_num = transformer_layer_num + self._segment_method = segment_method + + def apply(self, src_model_path: str, dst_model_path: str): + for i in range(self._src_parallel_config.mp): + for j in range(self._src_parallel_config.sharding): + # TODO(liuzhenhai): use multiple processs + layers = [] + + # 1、extract layers in the same pp group + group = self._src_parallel_config.pipe_parallel_group(i, j) + src_dirs = [ + "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( + src_model_path, *e + ) + for e in group + ] + # first rank extract shared layer + with_shared = True + for dir in src_dirs: + print("extract layer params in dir %s" % dir) + layers.extend(self.extract_layers(dir, with_shared)) + with_shared = False + # 2、sort and unique layers + layers = self.sort_layers(layers) + + # 3、resplit layers among pp group according new pp config + layer_segments = self.segment_layers( + layers, self._dst_parallel_config, self._segment_method + ) + dst_group = self._dst_parallel_config.pipe_parallel_group(i, j) + dst_dirs = [ + "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( + dst_model_path, *e + ) + for e in dst_group + ] + + # 4、merge layers belonging to the same node + for (layer_segment, dir_) in zip(layer_segments, dst_dirs): + print(f"merge {len(layer_segment)} layers to {dir_}") + self.merge_layers(layer_segment, dir_) + + # 5、copy meta_state.pdopt + for (src_dir, dst_dir) in zip(src_dirs, dst_dirs): + shutil.copyfile( + f"{src_dir}/meta_state.pdopt", + f"{dst_dir}/meta_state.pdopt", + ) + + def peek_model(self, model_dir: str): + for i in 
range(self._src_parallel_config.mp): + for j in range(self._src_parallel_config.sharding): + group = self._src_parallel_config.pipe_parallel_group(i, j) + dirs = [ + "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( + model_dir, *e + ) + for e in group + ] + for dir in dirs: + print(f"peek partial model in {dir}:") + self.peek_partial_model(dir) + + def peek_partial_model(self, sub_dir: str): + state_dict = paddle.load(f"{sub_dir}/model.pdparams") + for (k, v) in state_dict.items(): + print(f"\t{k} -> {v.name}") + + def extract_layers(self, dir: str, with_shared: bool): + opt = paddle.load(dir + "/model_state.pdopt") + params = paddle.load(dir + "/model.pdparams") + shared_layer_parsed = False + # tname -> (layer, param_name) + tname_to_layer_and_pname = {} + for (k, v) in params.items(): + layer = self._extract_layer_name(k) + assert layer + # special treatment for embedding layer, skip duplicated shared layer + # shared layer may exist or not, if it exist it share weight with _layers.0 + # _layers.shared_layers.embed.word_embeddings.weight -> embedding_0.w_0 + # _layers.shared_layers.embed.position_embeddings.weight -> embedding_1.w_0 + # _layers.0.word_embeddings.weight -> embedding_0.w_0 + # _layers.0.position_embeddings.weight -> embedding_1.w_0 + shared_layer_parsed = shared_layer_parsed or ( + "_layers.shared_layers" in layer + ) + if ( + "_layers.shared_layers" not in layer + and ("word_embeddings" in k or "position_embeddings" in k) + and shared_layer_parsed + ): + continue + tname_to_layer_and_pname[v.name] = (layer, k) + + # get opt-> param mapping + tensor_names = list(tname_to_layer_and_pname.keys()) + opt_names = [ + e for e in opt.keys() if e not in ["master_weights", "LR_Scheduler"] + ] + opt_to_t = self._opt_name_to_tname(tensor_names, opt_names) + # gather tensors belonging to one layer togather + layers = OrderedDict() + for (k, v) in params.items(): + layer, p = tname_to_layer_and_pname[v.name] + if layer not in layers: + layers[layer] = {} + layers[layer]["opt"] = OrderedDict() + layers[layer]["params"] = OrderedDict() + layers[layer]["master_weights"] = OrderedDict() + layers[layer]["params"][p] = v + + for (k, v) in opt.items(): + if k in ["master_weights", "LR_Scheduler"]: + continue + layer, _ = tname_to_layer_and_pname[opt_to_t[v.name]] + layers[layer]["opt"][k] = v + + if "master_weights" in opt: + for (k, v) in opt["master_weights"].items(): + layer, _ = tname_to_layer_and_pname[k] + layers[layer]["master_weights"][k] = v + + if "LR_Scheduler" in opt: + for layer in layers: + layers[layer]["LR_Scheduler"] = opt["LR_Scheduler"] + + ans = [] + + for (layer_name, layer) in layers.items(): + # special treatment for embedding layer + if (not with_shared) and "shared_layers" in layer_name: + continue + file_name = f"./tmp_layer_files/{layer_name}.tmp" + paddle.save(layer, file_name) + ans.append((layer_name, file_name)) + print(f"save layer {layer_name} to {file_name}") + return ans + + def sort_layers(self, layers: list): + def priority(elem): + layer_name = elem[0] + if "shared_layers" in layer_name: + return -float(0.5) + match = re.search( + r"^_layers((\.\d+)+|(\.shared_layers\.[^\.]+))", layer_name + ) + assert match, f"{layer_name} not a valid layer name" + return float(match.group(1).lstrip(".")) + + # strictly sort layers + print("before sort %s" % ("|".join([e[0] for e in layers]))) + layers.sort(key=priority) + # unique + unique_layers = [] + for e in layers: + if unique_layers and e[0] == unique_layers[-1][0]: + continue + unique_layers.append(e) + 
print("after sort %s " % ("|".join([e[0] for e in unique_layers]))) + return unique_layers + + def segment_layers( + self, + layers: list, + config: ParallelConfig, + segment_method: str = "layer", + ): + layer_num = len(layers) + stage_num = config.pp * config.vpp + + # segment by weights + def segment_by_layer(): + # assume model is of the structure below + # embedding -> n*(transformer layer) -> [optional output layer] + # segment index + weights = [0 for _ in range(layer_num)] + non_zero_layers = range(1, layer_num - 1) + # input layer is embedding + if self._transformer_layer_num: + assert self._transformer_layer_num < layer_num + non_zero_layers = range(1, 1 + self._transformer_layer_num) + for i in non_zero_layers: + weights[i] = 1 + + part_size = sum(weights) // stage_num + result = [0 for _ in range(stage_num + 1)] + memory_counter = 0 + result_idx = 1 + for idx, weight in enumerate(weights): + memory_counter += weight + if memory_counter == part_size: + result[result_idx] = idx + 1 + result_idx += 1 + memory_counter = 0 + result[stage_num] = layer_num + return result + + def segment_uniform(): + result = [0 for _ in range(stage_num + 1)] + part_size = math.floor(layer_num / stage_num) + extra_layers = layer_num % stage_num + for i in range(1, stage_num): + offset = 1 if i > (stage_num - extra_layers) else 0 + result[i] = int( + min(result[i - 1] + part_size + offset, layer_num) + ) + result[stage_num] = layer_num + return result + + result = ( + segment_uniform() + if (segment_method == "uniform") + else segment_by_layer() + ) + index_segments = [[] for _ in range(config.pp)] + for i in range(stage_num): + index_segments[i % config.pp].append((result[i], result[i + 1])) + + # name layers + segments = [[] for i in range(config.pp)] + for i in range(config.pp): + for (start, end) in index_segments[i]: + for j in range(start, end): + if config.vpp > 1: + segments[i].append( + ( + [f"_layers.{start}.{j - start}"], + layers[j][1], + ) + ) + else: + segments[i].append(([f"_layers.{j}"], layers[j][1])) + + shared_layer_exist = any( + "_layers.shared_layers" in e[0] for e in layers + ) + if shared_layer_exist: + # special treatment for shared layer + if config.vpp > 1: + segments[0] = [ + ([layers[0][0], segments[0][0][0][0]], layers[0][1]) + ] + segments[0][1:] + else: + segments[0] = [([layers[0][0]], layers[0][1])] + segments[0][1:] + + for i in range(1, config.pp): + segments[i] = [([layers[0][0]], layers[0][1])] + segments[i] + + for (pp_rank, segs) in enumerate(segments): + print(f"segmentment result for pp_rank {pp_rank}:") + print(50 * "=") + for seg in segs: + print(f"{seg[0]} => {seg[1]}") + return segments + + def merge_layers(self, layers_segment: list, save_dir: str): + params = OrderedDict() + opt = OrderedDict() + master_weights = OrderedDict() + renaming_manager = LayerReNamingManager() + + def merge(src, dst, map_k=None): + for (k, v) in src.items(): + k = map_k(k) if map_k is not None else k + dst[k] = v + + lr_scheduler = None + for (layer_names, file_path) in layers_segment: + print("load %s" % file_path) + layer = paddle.load(file_path) + + def get_param_name_mapper(layer_name): + # replace layer name + def map_param_name(param_name): + layer_pre = self._extract_layer_name(param_name) + return layer_name + param_name[len(layer_pre) :] + + return map_param_name + + ( + layer_params, + layer_opt, + layer_master_weight, + ) = self._map_tensor_names( + layer["params"], + layer["opt"], + layer["master_weights"], + renaming_manager, + ) + for layer_name in layer_names: + 
merge(layer_params, params, get_param_name_mapper(layer_name)) + merge(layer_opt, opt) + merge(layer_master_weight, master_weights) + lr_scheduler = layer["LR_Scheduler"] + + opt = self._pack_opt_state_dict(opt, master_weights, lr_scheduler) + paddle.save(params, save_dir + "/model.pdparams") + paddle.save(opt, save_dir + "/model_state.pdopt") + + def _pack_opt_state_dict(self, opt, master_weights, lr_scheduler): + opt["master_weights"] = master_weights + opt["LR_Scheduler"] = lr_scheduler + return opt + + def _extract_layer_name(self, param_name: str): + match = re.search( + r"^_layers((\.\d+)+|(\.shared_layers\.[^\.]+))", param_name + ) + layer_name = "" + return "" if (not match) else match.group() + + # map opt names to tensor name + def _opt_name_to_tname(self, tensor_names, opt_names): + tensor_names = set(tensor_names) + all_names = [] + all_names.extend(list(tensor_names)) + all_names.extend(opt_names) + all_names.sort() + pre_t_name = "" + opt_to_t = {} + for n in all_names: + if n in tensor_names: + # we get a param + pre_t_name = n + else: + assert pre_t_name + opt_to_t[n] = pre_t_name + return opt_to_t + + def _map_tensor_names(self, params, opt, master_weights, renaming_manager): + opt_renamed = OrderedDict() + master_weights_renamed = OrderedDict() + # old name to new name + t_name_mapping = {} + # map tensor names + for (k, v) in params.items(): + t_name_mapping[v.name] = renaming_manager.get_new_param_name(v.name) + v.name = t_name_mapping[v.name] + # map opt names + opt_to_tname = self._opt_name_to_tname( + t_name_mapping.keys(), opt.keys() + ) + for (k, v) in opt.items(): + old_t_name = opt_to_tname[k] + t_name = t_name_mapping[old_t_name] + opt_name = t_name + k[len(old_t_name) :] + v.name = opt_name + opt_renamed[opt_name] = v + + # map master names + for (k, v) in master_weights.items(): + t_name = t_name_mapping[k] + v.name = t_name + v.name[len(k) :] + master_weights_renamed[t_name] = v + return (params, opt_renamed, master_weights_renamed) + + +def parse_args(): + + parser = argparse.ArgumentParser( + prog='model converter', description='converter a model' + ) + parser.add_argument( + '--src_path', + type=str, + default="./output/epoch_0_step_30", + help='path of the model to convert', + ) + + parser.add_argument( + '--dst_path', + type=str, + default="./test_adapt", + help='path to saved the converted model', + ) + + parser.add_argument( + '--src_mp', + type=int, + default=2, + help='mp degree of the origin triaing task that dumpped this model', + ) + + parser.add_argument( + '--src_pp', + type=int, + default=2, + help='pp degree of the origin triaing task that dumpped this model', + ) + + parser.add_argument( + '--src_vp', + type=int, + default=2, + help='vp degree of the origin triaing task that dumpped this model', + ) + + parser.add_argument( + '--dst_mp', + type=int, + default=None, + help='mp degree of the origin triaing task that dumpped this model', + ) + + parser.add_argument( + '--dst_pp', + type=int, + default=None, + help='pp degree of the expected triaing task that would recover this model', + ) + + parser.add_argument( + '--dst_vp', + type=int, + default=2, + help='vp degree of the expected triaing task that would recover this model', + ) + + parser.add_argument( + '--sharding', + type=int, + default=1, + help=" sharding degree of both the origin triaing task that dumpped this model and the expected triaing task that would recover this model", + ) + + parser.add_argument( + '--method', + type=str, + default="adapt_model", + help='vp degree of the 
expected triaing task that would recover this model', + ) + + parser.add_argument( + '--segment_method', + type=str, + default="layer", + help='method to segment layers to pp or vp stages', + ) + + parser.add_argument( + '--transformer_layer_num', + type=int, + default=0, + help='transformer_layer_num of the model', + ) + # assume model is of the structure below + # embedding -> n*[transformer layer] -> optional output layer + + args = parser.parse_args() + + if args.dst_mp is None: + args.dst_mp = args.src_mp + if args.dst_pp is None: + args.dst_pp = args.src_pp + + assert args.src_mp == args.dst_mp, "src mp {} dst mp {}".format( + args.src_mp, args.dst_mp + ) + + assert args.method in [ + 'peek_model', + 'adapt_model', + ], "method should be in ['peek_model', 'adapt_model']" + assert args.segment_method in [ + "uniform", + "layer", + ], "segment_method should be 'uniform' or 'layer" + + print( + "adapt model dumped by task with pp degree:{}, vp degree:{}, mp degree:{} to task with pp degree:{}, vp degree:{}, mp degree:{}".format( + args.src_pp, + args.src_vp, + args.src_mp, + args.dst_pp, + args.dst_vp, + args.dst_mp, + ) + ) + + return args + + +def adaptor_from_args(args): + src_parallel_config = ParallelConfig( + args.src_mp, args.src_pp, args.src_vp, args.sharding + ) + + dst_parallel_config = ParallelConfig( + args.dst_mp, args.dst_pp, args.dst_vp, args.sharding + ) + + adaptor = PipeLineModelAdaptor( + src_parallel_config, + dst_parallel_config, + args.transformer_layer_num, + args.segment_method, + ) + return adaptor + + +def main(): + + args = parse_args() + adaptor = adaptor_from_args(args) + if args.method == "peek_model": + adaptor.peek_model(args.dst_path) + elif args.method == "adapt_model": + adaptor.apply(args.src_path, args.dst_path) + + +if __name__ == "__main__": + """ + Usage: + python pp_parallel_adaptor.py --src_mp xxx --src_path xxx --method \ + adapt_model/peek_model --dst_path xxx --sharding xxx --segment_method xxx --transformer_layer_num xxx + + for the meaning of a specific arg, please use: + python pp_parallel_adaptor.py -h + """ + main() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt index 637dffe42898a..dad5f1d4b5b6b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt @@ -168,6 +168,19 @@ if((WITH_GPU) AND LOCAL_ALL_PLAT) test_parallel_dygraph_pipeline_parallel_with_virtual_stage PROPERTIES TIMEOUT "500") endif() +if((WITH_GPU) AND LOCAL_ALL_PLAT) + bash_test_modules( + test_parallel_dygraph_pp_adaptor + START_BASH + ../../dist_test.sh + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=21976;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" + ) + set_tests_properties(test_parallel_dygraph_pp_adaptor PROPERTIES TIMEOUT + "500") +endif() if((WITH_GPU OR WITH_XPU) AND (LINUX)) py_test_modules( test_fleet_localsgd_meta_optimizer MODULES diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py index 6c1ae3eac44e2..216f37796daf0 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py @@ -121,11 +121,11 @@ def forward(self, out, label): class ModelPipe(PipelineLayer): - 
def __init__(self, topology): + def __init__(self, topology, transformer_layer_num: int = 6): self.descs = [] self.descs.append(LayerDesc(EmbeddingPipe)) - for x in range(6): + for x in range(transformer_layer_num): self.descs.append(LayerDesc(TransformerNetPipe)) self.descs.append(lambda x: x[0]) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save.py new file mode 100644 index 0000000000000..a8cf970f73d22 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save.py @@ -0,0 +1,101 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +import numpy as np +from hybrid_parallel_pp_transformer import ModelPipe, set_random_seed + +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet + +batch_size = 8 +length = 8 +micro_batch_size = 2 +vocab_size = 128 +transformer_layer_num = 8 + + +class TestDistPPSaveTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size, + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + print(f"pwd {os.getcwd()}") + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + topology = hcg.topology() + set_random_seed(1024, dp_id, rank_id) + + model = ModelPipe(topology, transformer_layer_num=transformer_layer_num) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True + ) + optimizer = paddle.optimizer.SGD( + learning_rate=scheduler, parameters=model.parameters() + ) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + output_dir = "{}/mp_00_sharding_00_pp_{:0>2d}".format( + "./pp_transformer", pp_id + ) + try: + os.makedirs(output_dir) + except: + # dir is already created, do nothing + pass + for step_id in range(2): + x_data = np.random.randint(0, vocab_size, size=[batch_size, length]) + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + + paddle.save( + model.state_dict(), + os.path.join(output_dir, "model.pdparams"), + ) + + paddle.save( + optimizer.state_dict(), + os.path.join(output_dir, "model_state.pdopt"), + ) + meta_dict = { + "epoch": 0, + "step": 2, + "cuda_rng_state": paddle.get_cuda_rng_state(), + } + paddle.save(meta_dict, 
os.path.join(output_dir, "meta_state.pdopt")) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py new file mode 100644 index 0000000000000..372cbe7f48d93 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +import numpy as np +from hybrid_parallel_pp_transformer_with_virtual_stage import ( + ModelPipe, + set_random_seed, +) + +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet + +batch_size = 8 +length = 8 +micro_batch_size = 2 +vocab_size = 128 + +transformer_layer_num = 8 + + +class TestDistPPSaveTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size, + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + print(f"pwd {os.getcwd()}") + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + topology = hcg.topology() + set_random_seed(1024, dp_id, rank_id) + + model = ModelPipe(topology, transformer_layer_num=transformer_layer_num) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True + ) + optimizer = paddle.optimizer.SGD( + learning_rate=scheduler, parameters=model.parameters() + ) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + output_dir = "{}/mp_00_sharding_00_pp_{:0>2d}".format( + "./pp_transformer_vp", pp_id + ) + try: + os.makedirs(output_dir) + except: + # dir is already created, do nothing + pass + for step_id in range(2): + x_data = np.random.randint(0, vocab_size, size=[batch_size, length]) + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + + paddle.save( + model.state_dict(), + os.path.join(output_dir, "model.pdparams"), + ) + paddle.save( + optimizer.state_dict(), + os.path.join(output_dir, "model_state.pdopt"), + ) + meta_dict = { + "epoch": 0, + "step": 2, + "cuda_rng_state": paddle.get_cuda_rng_state(), + } + paddle.save(meta_dict, os.path.join(output_dir, "meta_state.pdopt")) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py index 21cc9134e4d3d..9f43fc4c9ef8e 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py @@ -120,11 +120,10 @@ def forward(self, out, label): class ModelPipe(PipelineLayer): - def __init__(self, topology): + def __init__(self, topology, transformer_layer_num: int = 8): self.descs = [] self.descs.append(LayerDesc(EmbeddingPipe)) - - for x in range(8): + for x in range(transformer_layer_num): self.descs.append(LayerDesc(TransformerNetPipe)) self.descs.append(lambda x: x[0]) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pp_adaptor.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pp_adaptor.py new file mode 100644 index 0000000000000..3b11210991d3f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pp_adaptor.py @@ -0,0 +1,163 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import unittest + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + +import paddle +from paddle.distributed.fleet.utils.pp_parallel_adaptor import ( + ParallelConfig, + PipeLineModelAdaptor, + adaptor_from_args, + parse_args, +) + + +class TestPPAdaptor(TestMultipleGpus): + def test_parse_args(self): + args = parse_args() + self.assertEqual(args.src_mp, args.dst_mp) + adaptor = adaptor_from_args(args) + self.assertTrue(adaptor is not None) + + def test_hybrid_parallel_transformer_unbalanced_data(self): + print(f"pwd {os.getcwd()}") + self.run_mnist_2gpu('hybrid_parallel_pp_transformer_save.py') + self.run_mnist_2gpu( + 'hybrid_parallel_pp_transformer_save_with_virtual_stage.py' + ) + # test pp adaptor + dir1 = "./pp_transformer" + p_config1 = ParallelConfig(mp=1, pp=2, vpp=1, sharding=1) + dir2 = "./pp_transformer_vp" + p_config2 = ParallelConfig(mp=1, pp=2, vpp=2, sharding=1) + + pp_to_vp = PipeLineModelAdaptor( + src_parallel_config=p_config1, + dst_parallel_config=p_config2, + transformer_layer_num=8, + segment_method="layer", + ) + vp_to_pp = PipeLineModelAdaptor( + src_parallel_config=p_config2, + dst_parallel_config=p_config1, + transformer_layer_num=8, + segment_method="layer", + ) + + def check_converted_model(converted_model_dir, expected_model_dir): + # for compatibility, converted_model_dir may contain more key than + # expected model, which does not hinder model recovering + for i in range(p_config1.pp): + sub_converted_model_dir = ( + "{}/mp_00_sharding_00_pp_{:0>2d}".format( + converted_model_dir, i + ) + ) + sub_expected_model_dir = ( + "{}/mp_00_sharding_00_pp_{:0>2d}".format( + expected_model_dir, i + ) + ) + print( + f"converted_model_dir: {sub_converted_model_dir}; expected_model_dir: {sub_expected_model_dir}" + ) + + def check_names(dict_1, dict_2): + for (k, v) in dict_2.items(): + self.assertTrue(k in dict_1) + self.assertEqual( + getattr(v, "name", ""), + getattr(dict_1[k], "name", ""), + ) + + # check param + params_1 = paddle.load( + f"{sub_converted_model_dir}/model.pdparams" + ) + params_2 = paddle.load( + f"{sub_expected_model_dir}/model.pdparams" + ) + check_names(params_1, params_2) + del params_1 + del params_2 + # check opt + opt_1 = paddle.load( + f"{sub_converted_model_dir}/model_state.pdopt" + ) + opt_2 = paddle.load( + f"{sub_expected_model_dir}/model_state.pdopt" + ) + check_names(opt_1, opt_2) + # check master wieghts + if "master_weights" in opt_2: + self.assertTrue("master_weights" in opt_1) + check_names( + opt_2["master_weights"], opt_1["master_weights"] + ) + + def create_dir_if_nonexist(dir: str): + if not os.path.exists(dir): + os.makedirs(dir) + + # check pp to vp + tmp_dir1 = "./tmp_pp_to_vp" + create_dir_if_nonexist(tmp_dir1) + pp_to_vp.apply(dir1, tmp_dir1) + # browse the converted model + pp_to_vp.peek_model(tmp_dir1) + # check + check_converted_model(tmp_dir1, dir2) + + # check vp to pp + tmp_dir2 = "./tmp_vp_to_pp" + create_dir_if_nonexist(tmp_dir2) + vp_to_pp.apply(dir2, tmp_dir2) + vp_to_pp.peek_model(tmp_dir2) + check_converted_model(tmp_dir2, dir1) + + # check uniform segment + tmp_dir3 = "./tmp_vp_to_pp_uniform" + create_dir_if_nonexist(tmp_dir3) + vp_to_pp_uniform = PipeLineModelAdaptor( + src_parallel_config=p_config2, + dst_parallel_config=p_config1, + transformer_layer_num=8, + segment_method="uniform", + ) + vp_to_pp_uniform.apply(dir2, tmp_dir3) + vp_to_pp_uniform.peek_model(tmp_dir3) + + tmp_dir4 = "./tmp_pp_to_pp_uniform" + create_dir_if_nonexist(tmp_dir4) + pp_to_pp_uniform = 
PipeLineModelAdaptor( + src_parallel_config=p_config1, + dst_parallel_config=p_config1, + transformer_layer_num=8, + segment_method="uniform", + ) + pp_to_pp_uniform.apply(dir1, tmp_dir4) + pp_to_pp_uniform.peek_model(tmp_dir4) + check_converted_model(tmp_dir3, tmp_dir4) + + # rm dirs + for d in [dir1, dir2, tmp_dir1, tmp_dir2, tmp_dir3, tmp_dir4]: + shutil.rmtree(d, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv index 459a3e39df8e8..688b5e759b1db 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv @@ -14,6 +14,7 @@ test_dygraph_sharding_stage3_for_eager,,,350,DIST,../../dist_test.sh,2,,http_pro test_communicator_half_async,,,120,DIST,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_parallel_dygraph_pipeline_parallel,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_pipeline_parallel_with_virtual_stage,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_parallel_dygraph_pp_adaptor,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_localsgd_meta_optimizer,LINUX,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_class_center_sample,,GPU,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_pipeline,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index a482d8bed4150..e5d011453f29c 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -1071,11 +1071,18 @@ def load(path, **configs): # paddle2.0: paddle.save/load if "StructuredToParameterName@@" in load_result: - for key in load_result["StructuredToParameterName@@"]: + for (key, name) in load_result[ + "StructuredToParameterName@@" + ].items(): if isinstance(load_result[key], np.ndarray): load_result[key] = _ndarray_to_tensor( load_result[key], config.return_numpy ) + # default name is "generatedxxx" which is set in Tensor init, if not set + if not config.return_numpy and getattr( + load_result[key], "name", "" + ): + load_result[key].name = name if ( not config.keep_name_table diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index 74b2fdb47a17e..e8660fc7b02c5 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -76,7 +76,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_pass_quantization MODULES test_pass_quantization) set_tests_properties(test_pass_quantization PROPERTIES TIMEOUT 60) py_test_modules(test_tuning_recompute MODULES test_tuning_recompute) - set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 240) + set_tests_properties(test_tuning_recompute PROPERTIES TIMEOUT 300) py_test_modules(test_fused_linear_pass MODULES test_fused_linear_pass) set_tests_properties(test_fused_linear_pass PROPERTIES TIMEOUT 20) py_test_modules(test_align_tool MODULES test_align_tool) diff --git a/test/distributed_passes/CMakeLists.txt b/test/distributed_passes/CMakeLists.txt index 83d2f0fb52994..e2b8697fc8591 100644 --- a/test/distributed_passes/CMakeLists.txt +++ 
b/test/distributed_passes/CMakeLists.txt @@ -29,6 +29,6 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) list(APPEND DIST_TEST_OPS ${TEST_OP}) - set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120) + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST") endforeach() diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index b3bb181ea4a81..90620dab0fd3d 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -197,7 +197,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_fc_fuse_quant_dequant_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_conv_quant_dequant_pass PROPERTIES TIMEOUT 100) - set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100) + set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 180) set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) From 0b6dd5355e5327912c2476e167277618079d9fce Mon Sep 17 00:00:00 2001 From: zhwesky2010 <1183042833@qq.com> Date: Wed, 26 Apr 2023 21:19:18 +0800 Subject: [PATCH 088/405] [Zero-Dim] distributed scatter/all_to_all support input 0D tensor (#53186) --- .../fluid/operators/collective/c_allgather_op.cc | 9 +++++++-- .../operators/collective/c_allgather_op.cu.cc | 6 +----- .../fluid/operators/collective/c_allgather_op.h | 5 +---- .../operators/collective/c_allgather_op_xpu.cc | 5 +---- paddle/phi/core/ddim.cc | 4 ++-- .../paddle/distributed/communication/scatter.py | 6 ++---- .../communication/stream/all_gather.py | 6 +++++- .../communication/stream/all_to_all.py | 15 +++++++++++++-- .../distributed/communication/stream/scatter.py | 6 +++++- 9 files changed, 37 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 04019756ffe3e..b78d9504dafb7 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -31,8 +31,13 @@ class CAllGatherOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The value of nranks should be >=2.")); framework::DDim dim = ctx->GetInputDim("X"); - dim[0] = dim[0] * nranks; - if (dim[0] < 0) dim[0] = -1; + // 0D use stack/unstack while others use concat/split + if (dim.size() == 0) { + dim = phi::make_ddim({nranks}); + } else { + dim[0] = dim[0] * nranks; + if (dim[0] < 0) dim[0] = -1; + } ctx->SetOutputDim("Out", dim); } }; diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 93be43a1a324a..70b7d70dc93b3 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -57,13 +57,9 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { platform::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->nranks())); - framework::DDim out_dims = in->dims(); - out_dims[0] *= nranks; - out->mutable_data(out_dims, place); - int64_t send_numel = in->numel(); const T* send_buff = in->data(); - T* recv_buff = out->data(); + T* recv_buff = out->mutable_data(place); gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { diff --git a/paddle/fluid/operators/collective/c_allgather_op.h 
b/paddle/fluid/operators/collective/c_allgather_op.h index e896f96ead532..c5373bf130438 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -39,15 +39,12 @@ class CAllGatherOpCPUKernel : public framework::OpKernel { #if defined(PADDLE_WITH_GLOO) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); - framework::DDim out_dims = in->dims(); auto place = ctx.GetPlace(); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); auto nranks = gloo->Size(); - out_dims[0] *= nranks; int64_t send_numel = in->numel(); const T* send_buff = in->data(); - T* recv_buff = out->mutable_data(out_dims, place); + T* recv_buff = out->mutable_data(place); PADDLE_ENFORCE_EQ( gloo->IsInitialized(), diff --git a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc index 1e7d3f3a9fec1..c4fdb0fdf290e 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc @@ -43,12 +43,9 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "nranks: %s should equal to %s", nranks, comm->nranks())); - framework::DDim out_dims = in->dims(); - out_dims[0] *= nranks; - size_t numel = in->numel(); const void* sendbuff = in->data(); - void* recvbuff = out->mutable_data(out_dims, place); + void* recvbuff = out->mutable_data(place); XPUStream stream = nullptr; if (ctx.Attr("use_calc_stream")) { diff --git a/paddle/phi/core/ddim.cc b/paddle/phi/core/ddim.cc index 3256458e02be9..05ca29843b42f 100644 --- a/paddle/phi/core/ddim.cc +++ b/paddle/phi/core/ddim.cc @@ -154,7 +154,7 @@ DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } DDim stride(const DDim& ddim) { DDim strides; strides.rank_ = ddim.size(); - strides[ddim.size() - 1] = 1; + if (ddim.size() > 0) strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; } @@ -164,7 +164,7 @@ DDim stride(const DDim& ddim) { DDim stride_numel(const DDim& ddim) { DDim strides; strides.rank_ = ddim.size(); - strides[ddim.size() - 1] = ddim[ddim.size() - 1]; + if (ddim.size() > 0) strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; } diff --git a/python/paddle/distributed/communication/scatter.py b/python/paddle/distributed/communication/scatter.py index a8ba94e49c814..2826779d55738 100644 --- a/python/paddle/distributed/communication/scatter.py +++ b/python/paddle/distributed/communication/scatter.py @@ -122,8 +122,7 @@ def scatter_object_list( in_obj_sizes.append(obj_size) max_obj_size_tensor = max(in_obj_sizes) else: - # NOTE: shape can be [] after 0D tensor support - max_obj_size_tensor = paddle.empty([1], dtype="int64") + max_obj_size_tensor = paddle.empty([], dtype="int64") stream.broadcast(max_obj_size_tensor, src) max_obj_size = int(max_obj_size_tensor.item()) @@ -137,8 +136,7 @@ def scatter_object_list( out_tensor = paddle.empty([max_obj_size], dtype="uint8") scatter(out_tensor, in_tensor_list if rank == src else None, src, group) - # NOTE: shape can be [] after 0D tensor support - out_tensor_size = paddle.empty([1], dtype="int64") + out_tensor_size = paddle.empty([], dtype="int64") scatter(out_tensor_size, in_obj_sizes if rank == src else None, src, group) out_object_list.clear() diff --git a/python/paddle/distributed/communication/stream/all_gather.py 
b/python/paddle/distributed/communication/stream/all_gather.py index 4d02753a1a634..69d9c5d52e080 100644 --- a/python/paddle/distributed/communication/stream/all_gather.py +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -108,7 +108,11 @@ def _all_gather_in_static_mode(tensor_list, tensor, group, sync_op): }, ) tensor_list.clear() - tensor_list.extend(paddle.split(out, nranks, 0)) + # 0D use stack/unstack while others use concat/split + if len(tensor.shape) == 0: + tensor_list.extend(paddle.unstack(out, 0)) + else: + tensor_list.extend(paddle.split(out, nranks, 0)) def all_gather( diff --git a/python/paddle/distributed/communication/stream/all_to_all.py b/python/paddle/distributed/communication/stream/all_to_all.py index d8793601f729a..38b1d2fcb3e82 100644 --- a/python/paddle/distributed/communication/stream/all_to_all.py +++ b/python/paddle/distributed/communication/stream/all_to_all.py @@ -78,7 +78,12 @@ def _all_to_all_in_static_mode( if isinstance(in_tensor_or_tensor_list, list): if len(in_tensor_or_tensor_list) == 0: raise RuntimeError("The input tensor_list should not be empty.") - in_tensor = paddle.concat(in_tensor_or_tensor_list, axis=0) + # 0D use stack/unstack while others use concat/split + if len(in_tensor_or_tensor_list[0].shape) == 0: + in_tensor = paddle.stack(in_tensor_or_tensor_list, axis=0) + else: + in_tensor = paddle.concat(in_tensor_or_tensor_list, axis=0) + out_tensor = out_tensor_or_tensor_list if isinstance(out_tensor_or_tensor_list, list): if len(out_tensor_or_tensor_list) != 0: @@ -110,7 +115,13 @@ def _all_to_all_in_static_mode( if isinstance(out_tensor_or_tensor_list, list): if not sync_op: dist.wait(out_tensor, use_calc_stream=False) - out_tensor_or_tensor_list.extend(paddle.split(out_tensor, nranks, 0)) + # 0D use stack/unstack while others use concat/split + if len(in_tensor_or_tensor_list[0].shape) == 0: + out_tensor_or_tensor_list.extend(paddle.unstack(out_tensor, 0)) + else: + out_tensor_or_tensor_list.extend( + paddle.split(out_tensor, nranks, 0) + ) return None diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py index c4a6a66afbcd4..c112516a1fc10 100644 --- a/python/paddle/distributed/communication/stream/scatter.py +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -91,7 +91,11 @@ def _scatter_in_static_mode( ) else: tensor_list = [tensor for _ in range(nranks)] - input_tensor = paddle.concat(tensor_list, axis=0) + # 0D use stack/unstack while others use concat/split + if len(tensor_list[0].shape) == 0: + input_tensor = paddle.stack(tensor_list, axis=0) + else: + input_tensor = paddle.concat(tensor_list, axis=0) ring_id = 0 if group is None else group.id From bfeedd29eafd8aeb623061d9d59c821cba8b7c2d Mon Sep 17 00:00:00 2001 From: mengziheng <121283369+mengziheng@users.noreply.github.com> Date: Thu, 27 Apr 2023 09:49:56 +0800 Subject: [PATCH 089/405] Pad grad (#53374) * add pad op * add_some_code * modify some code * add some code * add some code * modify some code * add some code * modify some code * Update composite_backward_api.h * modify some code * add some code * add some code * add some code --- paddle/fluid/operators/pad_op.cc | 25 ++++ .../composite_backward_api.h | 27 ++++ .../prim/api/manual_prim/eager_prim_api.cc | 11 ++ .../prim/api/manual_prim/prim_manual_api.h | 8 ++ .../prim/api/manual_prim/static_prim_api.cc | 27 ++++ paddle/phi/api/yaml/legacy_backward.yaml | 1 + .../fluid/tests/unittests/test_pad_op.py | 134 
++++++++++-------- 7 files changed, 176 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index fd23f57793970..e2a0b3e025381 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -17,6 +17,9 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" +#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" +#include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/phi/infermeta/unary.h" namespace paddle { @@ -129,6 +132,27 @@ class PadOpGradMaker : public framework::SingleGradOpMaker { } }; +class PadCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; + + public: + void Apply() override { + paddle::Tensor x = this->GetSingleForwardInput("X"); + paddle::Tensor out_grad = this->GetSingleOutputGrad("Out"); + paddle::Tensor x_grad = this->GetSingleInputGrad("X"); + auto* dx_ptr = this->GetOutputPtr(&x_grad); + std::string dx_name = this->GetOutputName(x_grad); + + std::vector paddings = + static_cast>(this->Attr>("paddings")); + float pad_value = static_cast(this->Attr("pad_value")); + VLOG(6) << "Runing add_grad composite func"; + + prim::pad_grad(x, out_grad, paddings, pad_value, dx_ptr); + this->RecoverOutputName(x_grad, dx_name); + } +}; + template class PadOpDoubleGradMaker : public framework::SingleGradOpMaker { public: @@ -155,6 +179,7 @@ REGISTER_OPERATOR(pad, ops::PadOpMaker, ops::PadOpGradMaker, ops::PadOpGradMaker, + ops::PadCompositeGradOpMaker, PadInferShapeFunctor); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 099ebc81b900b..9790f36ec590a 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1806,6 +1806,32 @@ void roll_grad(const Tensor& x, } } +template +void pad_grad(const Tensor& input, + const Tensor& out_grad, + const std::vector& paddings, + const Scalar& pad_value, + Tensor* input_grad) { + if (input_grad) { + size_t rank = input.dims().size(); + auto out_dims = out_grad.dims(); + + std::vector starts(rank, 0); + std::vector ends(rank, 0); + std::vector axes(rank, 0); + std::vector infer_flags(rank, 1); + std::vector decrease_axis({}); + for (size_t i = 0; i < rank; ++i) { + starts.push_back(static_cast(paddings[2 * i])); + ends.push_back(static_cast(out_dims[i] - paddings[2 * i + 1])); + axes.push_back(i); + } + auto out_tmp = + slice(out_grad, axes, starts, ends, infer_flags, decrease_axis); + set_output(out_tmp, input_grad); + } +} + template void scatter_nd_add_grad(const Tensor& index, const Tensor& updates, @@ -1821,5 +1847,6 @@ void scatter_nd_add_grad(const Tensor& index, set_output(tmp_updates_grad, updates_grad); } } + } // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc b/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc index 852add94fed6f..d667f0fabd71e 100644 --- a/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc +++ b/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc @@ -33,5 +33,16 @@ Tensor cast(const Tensor& x, DataType dtype) { return ::cast_ad_func(x, 
dtype); } +template <> +Tensor slice(const Tensor& input, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis) { + VLOG(4) << "Eager Prim API slice_ad_func call"; + return ::slice_ad_func(input, axes, starts, ends, infer_flags, decrease_axis); +} + } // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/api/manual_prim/prim_manual_api.h b/paddle/fluid/prim/api/manual_prim/prim_manual_api.h index 383ded54f548d..2146edba60db8 100644 --- a/paddle/fluid/prim/api/manual_prim/prim_manual_api.h +++ b/paddle/fluid/prim/api/manual_prim/prim_manual_api.h @@ -38,5 +38,13 @@ Tensor full(const IntArray& shape, template Tensor cast(const Tensor& x, DataType dtype); +template +Tensor slice(const Tensor& input, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis); + } // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc index efe5e53d173c3..0db4497e6289b 100644 --- a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc +++ b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc @@ -127,5 +127,32 @@ Tensor cast(const Tensor& x, DataType dtype) { return out; } +template <> +Tensor slice(const Tensor& input, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis) { + framework::BlockDesc* block = StaticCompositeContext::Instance().GetBlock(); + framework::OpDesc* op = block->AppendOp(); + op->SetType("slice"); + op->SetInput( + "Input", + {std::static_pointer_cast(input.impl())->Name()}); + auto out = empty({}, phi::DataType::FLOAT32, paddle::Place()); + op->SetOutput( + "Out", {std::static_pointer_cast(out.impl())->Name()}); + op->SetAttr("axes", unsafe_vector_cast(axes)); + op->SetAttr("starts", unsafe_vector_cast(starts.GetData())); + op->SetAttr("ends", unsafe_vector_cast(ends.GetData())); + op->SetAttr("infer_flags", unsafe_vector_cast(infer_flags)); + op->SetAttr("decrease_axis", unsafe_vector_cast(decrease_axis)); + op->CheckAttrs(); + op->InferVarType(block); + op->InferShape(*block); + return out; +} + } // namespace prim } // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 3f4acc31e6a99..c80e79e3ff207 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -722,6 +722,7 @@ func : pad_grad param: [out_grad, paddings, pad_value] no_need_buffer : x + composite : pad_grad(x, out_grad, paddings, pad_value, x_grad) backward : pad_double_grad - backward_op : pool2d_double_grad diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py index a40fd7a7409bc..5fd186338444d 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad_op.py @@ -24,7 +24,9 @@ def pad_wrapper(x, paddings, pad_value): - return paddle._C_ops.pad(x, paddings, float(pad_value)) + return paddle.nn.functional.pad( + x, pad=list(paddings), mode='constant', value=pad_value + ) class TestPadOp(OpTest): @@ -37,7 +39,7 @@ def setUp(self): 'X': np.random.random(self.shape).astype(self.dtype), } self.attrs = {} - self.attrs['paddings'] = np.array(self.paddings).flatten() + self.attrs['paddings'] = 
list(np.array(self.paddings).flatten()) self.attrs['pad_value'] = self.pad_value self.outputs = { 'Out': np.pad( @@ -47,6 +49,9 @@ def setUp(self): constant_values=self.pad_value, ) } + self.prim_op_type = "prim" + self.public_python_api = pad_wrapper + self.enable_cinn = False def get_dtype(self): return np.float64 @@ -55,7 +60,7 @@ def test_check_output(self): self.check_output() def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_prim=True) def initTestCase(self): self.shape = (16, 16) @@ -111,16 +116,19 @@ def test_check_grad_normal(self): class TestPadOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program(), Program()): - input_data = np.random.random((2, 2)).astype("float32") + with paddle.fluid.framework._static_guard(): + with program_guard(Program(), Program()): + input_data = np.random.random((2, 2)).astype("float32") - def test_Variable(): - paddle.nn.functional.pad(x=input_data, pad=[1, 1, 1, 1]) + def test_Variable(): + paddle.nn.functional.pad(x=input_data, pad=[1, 1, 1, 1]) - self.assertRaises(TypeError, test_Variable) + self.assertRaises(TypeError, test_Variable) - data = paddle.static.data(name='data', shape=[4], dtype='float16') - paddle.nn.functional.pad(x=data, pad=[0, 1]) + data = paddle.static.data( + name='data', shape=[4], dtype='float16' + ) + paddle.nn.functional.pad(x=data, pad=[0, 1]) class TestPaddingValueTensor(UnittestBase): @@ -129,34 +137,40 @@ def init_info(self): self.save_path = os.path.join(self.temp_dir.name, self.path_prefix()) def test_static(self): - main_prog = Program() - starup_prog = Program() - with program_guard(main_prog, starup_prog): - fc = paddle.nn.Linear(4, 10) - x = paddle.randn([2, 4]) - x.stop_gradient = False - feat = fc(x) # [2,3,10] - - out = self.call_func(feat) - - sgd = paddle.optimizer.SGD() - sgd.minimize(paddle.mean(out)) - self.assertTrue(self.var_prefix() in str(main_prog)) - - exe = paddle.static.Executor() - exe.run(starup_prog) - res = exe.run(fetch_list=[feat, out]) - gt = np.pad(res[0], [1, 1], 'constant', constant_values=[1.0, 1.0]) - np.testing.assert_allclose(res[1], gt) - paddle.static.save_inference_model( - self.save_path, [x], [feat, out], exe - ) - # Test for Inference Predictor - infer_outs = self.infer_prog() - gt = np.pad( - infer_outs[0], [1, 1], 'constant', constant_values=[1.0, 1.0] - ) - np.testing.assert_allclose(infer_outs[1], gt) + with paddle.fluid.framework._static_guard(): + main_prog = Program() + starup_prog = Program() + with program_guard(main_prog, starup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 4]) + x.stop_gradient = False + feat = fc(x) # [2,3,10] + + out = self.call_func(feat) + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + self.assertTrue(self.var_prefix() in str(main_prog)) + + exe = paddle.static.Executor() + exe.run(starup_prog) + res = exe.run(fetch_list=[feat, out]) + gt = np.pad( + res[0], [1, 1], 'constant', constant_values=[1.0, 1.0] + ) + np.testing.assert_allclose(res[1], gt) + paddle.static.save_inference_model( + self.save_path, [x], [feat, out], exe + ) + # Test for Inference Predictor + infer_outs = self.infer_prog() + gt = np.pad( + infer_outs[0], + [1, 1], + 'constant', + constant_values=[1.0, 1.0], + ) + np.testing.assert_allclose(infer_outs[1], gt) def path_prefix(self): return 'padding_value' @@ -183,23 +197,26 @@ def call_func(self, x): class TestPaddingValueTensor3(unittest.TestCase): def test_static(self): - np_x = np.random.random((16, 
16)).astype('float32') - main_prog = Program() - starup_prog = Program() - with program_guard(main_prog, starup_prog): - x = paddle.assign(np_x).astype('float32') - pad_value = paddle.assign([0.0]).astype('float64') - y = paddle.nn.functional.pad(x, [0, 1, 2, 3], value=pad_value) - loss = y.sum() - optimize_ops, params_grads = paddle.optimizer.SGD(0.01).minimize( - loss + with paddle.fluid.framework._static_guard(): + np_x = np.random.random((16, 16)).astype('float32') + main_prog = Program() + starup_prog = Program() + with program_guard(main_prog, starup_prog): + x = paddle.assign(np_x).astype('float32') + pad_value = paddle.assign([0.0]).astype('float64') + y = paddle.nn.functional.pad(x, [0, 1, 2, 3], value=pad_value) + loss = y.sum() + optimize_ops, params_grads = paddle.optimizer.SGD( + 0.01 + ).minimize(loss) + + exe = paddle.static.Executor(paddle.CPUPlace()) + res = exe.run( + main_prog, fetch_list=[y] + [g for p, g in params_grads] ) - - exe = paddle.static.Executor(paddle.CPUPlace()) - res = exe.run(main_prog, fetch_list=[y] + [g for p, g in params_grads]) - pd_out = res[0] - np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0) - np.testing.assert_allclose(pd_out, np_out) + pd_out = res[0] + np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0) + np.testing.assert_allclose(pd_out, np_out) @unittest.skipIf( @@ -215,13 +232,16 @@ def setUp(self): self.python_api = pad_wrapper x = np.random.random(self.shape).astype(np.float32) self.attrs = {} - self.attrs['paddings'] = np.array(self.paddings).flatten() + self.attrs['paddings'] = list(np.array(self.paddings).flatten()) self.attrs['pad_value'] = self.pad_value out = np.pad( x, self.paddings, mode='constant', constant_values=self.pad_value ) self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': convert_float_to_uint16(out)} + self.enable_cinn = False + self.prim_op_type = "prim" + self.public_python_api = pad_wrapper def initTestCase(self): self.shape = (16, 16) @@ -234,9 +254,9 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) if __name__ == '__main__': - paddle.enable_static() + # paddle.enable_static() unittest.main() From 82ac39132c49ca2792a37c2166f9d9242069d63d Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 27 Apr 2023 10:15:50 +0800 Subject: [PATCH 090/405] =?UTF-8?q?=E3=80=90Hackathon4=E3=80=91No5=20nexta?= =?UTF-8?q?fter=20=20(#52544)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/phi/api/yaml/ops.yaml | 10 ++ paddle/phi/kernels/cpu/nextafter_kernel.cc | 22 ++++ paddle/phi/kernels/gpu/nextafter_kernel.cu | 22 ++++ .../phi/kernels/impl/nextafter_kernel_impl.h | 84 +++++++++++++ paddle/phi/kernels/nextafter_kernel.h | 28 +++++ python/paddle/__init__.py | 2 + .../tests/unittests/test_nextafter_op.py | 118 ++++++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/math.py | 36 ++++++ 9 files changed, 324 insertions(+) create mode 100644 paddle/phi/kernels/cpu/nextafter_kernel.cc create mode 100644 paddle/phi/kernels/gpu/nextafter_kernel.cu create mode 100644 paddle/phi/kernels/impl/nextafter_kernel_impl.h create mode 100644 paddle/phi/kernels/nextafter_kernel.h create mode 100644 python/paddle/fluid/tests/unittests/test_nextafter_op.py diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 507fef3309660..8fc8c4c9b081c 100644 --- 
a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1421,6 +1421,16 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor +- op : nextafter + args : (Tensor x, Tensor y) + output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta + param: [x, y] + kernel : + func : nextafter + data_type : x + - op : nll_loss args : (Tensor input, Tensor label, Tensor weight, int64_t ignore_index = -100, str reduction = "mean") output : Tensor(out), Tensor(total_weight) diff --git a/paddle/phi/kernels/cpu/nextafter_kernel.cc b/paddle/phi/kernels/cpu/nextafter_kernel.cc new file mode 100644 index 0000000000000..ac4ab00a4d3fe --- /dev/null +++ b/paddle/phi/kernels/cpu/nextafter_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nextafter_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/nextafter_kernel_impl.h" + +PD_REGISTER_KERNEL( + nextafter, CPU, ALL_LAYOUT, phi::NextafterKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/nextafter_kernel.cu b/paddle/phi/kernels/gpu/nextafter_kernel.cu new file mode 100644 index 0000000000000..e0ac8212853c9 --- /dev/null +++ b/paddle/phi/kernels/gpu/nextafter_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nextafter_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/nextafter_kernel_impl.h" + +PD_REGISTER_KERNEL( + nextafter, GPU, ALL_LAYOUT, phi::NextafterKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/nextafter_kernel_impl.h b/paddle/phi/kernels/impl/nextafter_kernel_impl.h new file mode 100644 index 0000000000000..6d54009282528 --- /dev/null +++ b/paddle/phi/kernels/impl/nextafter_kernel_impl.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math.h" +#include "paddle/phi/kernels/nextafter_kernel.h" +namespace phi { +template +struct NextafterOut { + using type = T; +}; + +template <> +struct NextafterOut { + using type = double; +}; + +template <> +struct NextafterOut { + using type = double; +}; +template +struct NextafterFunctor { + NextafterFunctor(const T* x, + const T* y, + typename NextafterOut::type* out, + int64_t numel) + : x_(x), y_(y), out_(out), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + out_[idx] = static_cast::type>(std::nextafter( + static_cast(x_[idx]), static_cast(y_[idx]))); + } + const T* x_; + const T* y_; + typename NextafterOut::type* out_; + int64_t numel_; +}; +template <> +struct NextafterFunctor { + NextafterFunctor(const double* x, const double* y, double* out, int64_t numel) + : x_(x), y_(y), out_(out), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + out_[idx] = std::nextafter(x_[idx], y_[idx]); + } + + const double* x_; + const double* y_; + double* out_; + int64_t numel_; +}; + +template +void NextafterKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto* out_data = ctx.template Alloc(out); + auto x_data = x.data(); + auto y_data = y.data(); + auto x_numel = x.numel(); + + phi::funcs::ForRange for_range(ctx, x_numel); + phi::NextafterFunctor functor(x_data, y_data, out_data, x_numel); + for_range(functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/nextafter_kernel.h b/paddle/phi/kernels/nextafter_kernel.h new file mode 100644 index 0000000000000..3a185e39bd940 --- /dev/null +++ b/paddle/phi/kernels/nextafter_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void NextafterKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f319ab27c063a..d155dce987c08 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -299,6 +299,7 @@ from .tensor.math import trapezoid # noqa: F401 from .tensor.math import cumulative_trapezoid # noqa: F401 from .tensor.math import vander # noqa: F401 +from .tensor.math import nextafter # noqa: F401 from .tensor.random import bernoulli # noqa: F401 from .tensor.random import poisson # noqa: F401 @@ -688,4 +689,5 @@ 'cumulative_trapezoid', 'polar', 'vander', + 'nextafter', ] diff --git a/python/paddle/fluid/tests/unittests/test_nextafter_op.py b/python/paddle/fluid/tests/unittests/test_nextafter_op.py new file mode 100644 index 0000000000000..5048778e1b7b7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nextafter_op.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from eager_op_test import OpTest + +import paddle + + +def ref_nextafter(x, y): + out = np.nextafter(x, y) + return out + + +class TestNextafterAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.rand(2, 3, 4, 5).astype('float32') + self.y = np.random.rand(2, 3, 4, 5).astype('float32') + self.x1 = np.array([0, 0, 10]).astype("float32") + self.y1 = np.array([np.inf, -np.inf, 10]).astype("float32") + self.x2 = np.random.rand(100).astype("float32") + self.y2 = np.random.rand(100).astype("float32") + self.place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x.shape, dtype='float32' + ) + y = paddle.static.data( + name='y', shape=self.y.shape, dtype='float32' + ) + out = paddle.nextafter(x, y) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x, 'y': self.y}, fetch_list=[out]) + out_ref = ref_nextafter(self.x, self.y) + np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) + + with paddle.static.program_guard(paddle.static.Program()): + x1 = paddle.static.data( + name='x', shape=self.x1.shape, dtype='float32' + ) + y1 = paddle.static.data( + name='y', shape=self.y1.shape, dtype='float32' + ) + out = paddle.nextafter(x1, y1) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x1, 'y': self.y1}, fetch_list=[out]) + out_ref = ref_nextafter(self.x1, self.y1) + np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) + + with paddle.static.program_guard(paddle.static.Program()): + x2 = paddle.static.data( + name='x', shape=self.x2.shape, dtype='float32' + ) + y2 = paddle.static.data( + name='y', shape=self.y2.shape, 
dtype='float32' + ) + out = paddle.nextafter(x2, y2) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x2, 'y': self.y2}, fetch_list=[out]) + out_ref = ref_nextafter(self.x2, self.y2) + np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + out = paddle.nextafter(x, y) + out_ref = ref_nextafter(self.x, self.y) + np.testing.assert_allclose(out_ref, out.numpy(), rtol=1e-05) + paddle.enable_static() + + +class TestNextafterOP(OpTest): + def setUp(self): + self.op_type = "nextafter" + self.python_api = paddle.nextafter + self.init_dtype() + + x = np.array([1, 2]).astype(self.dtype) + y = np.array([2, 1]).astype(self.dtype) + out = np.nextafter(x, y) + self.inputs = {'x': x, 'y': y} + self.outputs = {'out': out} + + def test_check_output(self): + self.check_output() + + def init_dtype(self): + self.dtype = np.float64 + + +class TestNextafterOPFP32(TestNextafterOP): + def init_dtype(self): + self.dtype = np.float32 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b78ac0e57c22e..bea2fd7323d9f 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -251,6 +251,7 @@ from .math import sigmoid # noqa: F401 from .math import sigmoid_ # noqa: F401 from .math import vander # noqa: F401 +from .math import nextafter # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -540,6 +541,7 @@ 'sigmoid', 'sigmoid_', 'vander', + 'nextafter', ] # this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 2f94f0a7e2013..37519996a2acd 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -5486,3 +5486,39 @@ def vander(x, n=None, increasing=False, name=None): res[:, 1:] = paddle.cumprod(res[:, 1:], dim=-1) res = res[:, ::-1] if not increasing else res return res + + +def nextafter(x, y, name=None): + r""" + Return the next floating-point value after input towards other, elementwise. + The shapes of input and other must be broadcastable. + + Args: + x (Tensor): An N-D Tensor, the data type is float32, float64. + y (Tensor): An N-D Tensor, the data type is float32, float64. + name(str, optional):Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the shape and data type is the same with input. + + Examples: + .. 
code-block:: python + + import paddle + out = paddle.nextafter(paddle.to_tensor([1.0,2.0]),paddle.to_tensor([2.0,1.0])) + print(out) + #Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [1.00000012, 1.99999988]) + """ + if in_dygraph_mode(): + return _C_ops.nextafter(x, y) + else: + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'nextafter') + check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'nextafter') + op_type = "nextafter" + helper = LayerHelper(op_type, **locals()) + inputs = {"x": x, "y": y} + out = helper.create_variable_for_type_inference(dtype=paddle.float32) + outputs = {"out": out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out From 8a6ad6e5d353ab116c6c6c60245295fe71b2a391 Mon Sep 17 00:00:00 2001 From: superwinner1 <82640284+superwinner1@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:21:14 +0800 Subject: [PATCH 091/405] =?UTF-8?q?=E3=80=90Hackathon=20No.55=E3=80=91add?= =?UTF-8?q?=20fmax=20BF16=20test=20(#51925)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../phi/kernels/funcs/elementwise_functor.h | 11 +++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 1 + paddle/phi/kernels/kps/elementwise_kernel.cu | 1 + .../fluid/tests/unittests/test_fmax_op.py | 31 ++++++++++++++++++- 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index dc78bd7098411..af0ae3b6447be 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -229,6 +229,17 @@ struct FMaxFunctor { } }; +template <> +struct FMaxFunctor { + inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a, + const dtype::bfloat16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return static_cast(result); + } +}; + template <> struct FMaxFunctor { inline HOSTDEVICE int operator()(const int a, const int b) const { diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index b3ad0dacae37c..30e222663da10 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -97,6 +97,7 @@ PD_REGISTER_KERNEL(fmax_grad, double, int, phi::dtype::float16, + phi::dtype::bfloat16, int64_t) {} PD_REGISTER_KERNEL(fmin_grad, diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 80a969c4fabb4..cbf811a9830b0 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(fmax, double, int, float16, + bfloat16, int64_t) {} PD_REGISTER_KERNEL(fmin, diff --git a/python/paddle/fluid/tests/unittests/test_fmax_op.py b/python/paddle/fluid/tests/unittests/test_fmax_op.py index 19417919df248..0271854aebb72 100644 --- a/python/paddle/fluid/tests/unittests/test_fmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -241,5 +241,34 @@ def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out') +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not 
compiled with CUDA and not support the bfloat16", +) +class TestFmaxBF16OP(OpTest): + def setUp(self): + self.op_type = "elementwise_fmax" + self.python_api = paddle.fmax + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32") + out = np.fmax(x, y) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + if __name__ == "__main__": unittest.main() From 166964b109a7bbbfbc2dea040610eb1f38c9602e Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:21:40 +0800 Subject: [PATCH 092/405] update cmake3.16 to 3.18 (#53288) * update cmake3.16 to 3.18 * test * Update Dockerfile.ubuntu --- cmake/external/dlpack.cmake | 1 + tools/dockerfile/Dockerfile.ubuntu | 4 ++-- tools/dockerfile/Dockerfile.ubuntu18 | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 727202a434683..fc631c1d184aa 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -17,6 +17,7 @@ include(ExternalProject) set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) + set(DLPACK_TAG v0.4) set(DLPACK_INCLUDE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack/include) diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index fba5365f8b25e..fd7e476f710ab 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -42,8 +42,8 @@ RUN wget https://github.com/koalaman/shellcheck/releases/download/v0.7.1/shellch # install cmake WORKDIR /home -RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz -ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH +RUN wget -q https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.18.0-Linux-x86_64.tar.gz && rm cmake-3.18.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.18.0-Linux-x86_64/bin:$PATH # Install Python3.7 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18 index 35397967f283f..41f8f97db0018 100644 --- a/tools/dockerfile/Dockerfile.ubuntu18 +++ b/tools/dockerfile/Dockerfile.ubuntu18 @@ -50,8 +50,8 @@ ENV PATH=/usr/local/gcc-8.2/bin:$PATH # install cmake WORKDIR /home -RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz -ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH +RUN wget -q https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.18.0-Linux-x86_64.tar.gz && rm cmake-3.18.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.18.0-Linux-x86_64/bin:$PATH RUN apt-get update && \ From eee9c7887aaffb92cb2247c81a58fc88dcc2bd73 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:30:37 +0800 Subject: [PATCH 093/405] Register fluid xpu kerenls to phi 
[part 2] (#53188) * update * fix bug --- .../c_softmax_with_cross_entropy_op_xpu.cc | 20 ++++++++----- .../operators/collective/c_split_op_xpu.cc | 13 ++++---- .../collective/c_sync_calc_stream_op_xpu.cc | 4 +-- .../collective/c_sync_comm_stream_op_xpu.cc | 4 +-- .../operators/controlflow/logical_op_xpu.h | 7 ++++- .../controlflow/logicaland_op_xpu.cc | 27 +++++++++++------ .../controlflow/logicalnot_op_xpu.cc | 19 +++++++----- .../operators/controlflow/logicalor_op_xpu.cc | 26 ++++++++++------ .../detection/iou_similarity_op_xpu.cc | 5 ++-- .../fused/fused_gemm_epilogue_op_xpu.cc | 30 ++++++++++--------- paddle/fluid/operators/load_combine_op_xpu.cc | 17 ++++++----- paddle/fluid/operators/lod_reset_op.cc | 16 +++++----- paddle/fluid/operators/log_loss_op_xpu.cc | 13 ++++---- .../operators/metrics/accuracy_op_xpu.cc | 8 ++--- .../optimizers/lars_momentum_op_xpu.cc | 12 +++++--- .../operators/reduce_ops/logsumexp_op_xpu.cc | 4 +++ 16 files changed, 134 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc index 83824c8108401..695c28d77554a 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CSoftmaxWithCrossEntropyOp : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -468,7 +468,7 @@ struct CSoftmaxWithCrossEntropyFunctor { } }; -template +template class CSoftmaxWithCrossEntropyGrad : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -535,9 +535,13 @@ class CSoftmaxWithCrossEntropyGrad : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_softmax_with_cross_entropy, - ops::CSoftmaxWithCrossEntropyOp); - -REGISTER_OP_XPU_KERNEL( - c_softmax_with_cross_entropy_grad, - ops::CSoftmaxWithCrossEntropyGrad); +PD_REGISTER_STRUCT_KERNEL(c_softmax_with_cross_entropy, + XPU, + ALL_LAYOUT, + ops::CSoftmaxWithCrossEntropyOp, + float) {} +PD_REGISTER_STRUCT_KERNEL(c_softmax_with_cross_entropy_grad, + XPU, + ALL_LAYOUT, + ops::CSoftmaxWithCrossEntropyGrad, + float) {} diff --git a/paddle/fluid/operators/collective/c_split_op_xpu.cc b/paddle/fluid/operators/collective/c_split_op_xpu.cc index bad920a11ff5e..d573a83d708c4 100644 --- a/paddle/fluid/operators/collective/c_split_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_split_op_xpu.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class CSplitOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -87,7 +87,10 @@ class CSplitOpXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_split, - ops::CSplitOpXPUKernel, - ops::CSplitOpXPUKernel, - ops::CSplitOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL(c_split, + XPU, + ALL_LAYOUT, + ops::CSplitOpXPUKernel, + float, + int, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc index 4917057059ffc..0b432cab281fc 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc @@ -17,5 +17,5 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, - ops::CSyncCalcStreamKernel) +PD_REGISTER_STRUCT_KERNEL( + c_sync_calc_stream, XPU, ALL_LAYOUT, ops::CSyncCalcStreamKernel, float) {} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_xpu.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_xpu.cc index bbb3b62bf3cf6..ce2c20d57f0b3 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_xpu.cc @@ -17,5 +17,5 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(c_sync_comm_stream, - ops::CSyncCommStreamKernel); +PD_REGISTER_STRUCT_KERNEL( + c_sync_comm_stream, XPU, ALL_LAYOUT, ops::CSyncCommStreamKernel, float) {} diff --git a/paddle/fluid/operators/controlflow/logical_op_xpu.h b/paddle/fluid/operators/controlflow/logical_op_xpu.h index 8afefd6837449..614db61558f79 100644 --- a/paddle/fluid/operators/controlflow/logical_op_xpu.h +++ b/paddle/fluid/operators/controlflow/logical_op_xpu.h @@ -156,7 +156,12 @@ class BinaryLogicalOpXPUKernel : public framework::OpKernel { } }; -template +#define DEFINE_BINARY_LOGICAL_OP_XPU_KERNEL(op_name, xpu_type) \ + template \ + class BinaryLogical##op_name##CPUKernel \ + : public CReduceOpCPUKernel {}; + +template class UnaryLogicalOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc b/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc index 6248b6e0b0637..563317f209ebc 100644 --- a/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc @@ -14,14 +14,23 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/controlflow/logical_op_xpu.h" + +namespace paddle { +namespace operators { +DEFINE_BINARY_LOGICAL_OP_XPU_KERNEL(AND, XpuLogicalType::XPU_AND); +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - logical_and, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL(logical_and, + XPU, + ALL_LAYOUT, + ops::BinaryLogicalANDCPUKernel, + bool, + int8_t, + int16_t, + int, + int64_t, + float, + double) {} #endif diff --git a/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc b/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc index b8336c7201c3b..1431816810b1e 100644 --- a/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc @@ -15,12 +15,15 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/controlflow/logical_op_xpu.h" namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(logicalnot, - ops::UnaryLogicalOpXPUKernel, - ops::UnaryLogicalOpXPUKernel, - ops::UnaryLogicalOpXPUKernel, - ops::UnaryLogicalOpXPUKernel, - ops::UnaryLogicalOpXPUKernel, - ops::UnaryLogicalOpXPUKernel, - ops::UnaryLogicalOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL(logicalnot, + XPU, + ALL_LAYOUT, + ops::UnaryLogicalOpXPUKernel, + bool, + int8_t, + int16_t, + int, + int64_t, + float, + double) {} #endif diff --git a/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc b/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc index 126596841a29f..03c707222e44e 100644 --- a/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc @@ -15,14 +15,22 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/controlflow/logical_op_xpu.h" +namespace paddle { +namespace operators { +DEFINE_BINARY_LOGICAL_OP_XPU_KERNEL(OR, XpuLogicalType::XPU_OR); +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - logical_or, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel, - ops::BinaryLogicalOpXPUKernel); +PD_REGISTER_STRUCT_KERNEL(logical_or, + XPU, + ALL_LAYOUT, + ops::BinaryLogicalORCPUKernel, + bool, + int8_t, + int16_t, + int, + int64_t, + float, + double) {} #endif diff --git a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc index 1dc669075b17a..27ffa64c2a892 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class XPUIOUSimilarityKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -57,6 +57,7 @@ class XPUIOUSimilarityKernel : public framework::OpKernel { namespace ops = paddle::operators; using XPU = paddle::platform::XPUDeviceContext; -REGISTER_OP_XPU_KERNEL(iou_similarity, ops::XPUIOUSimilarityKernel); +PD_REGISTER_STRUCT_KERNEL( + iou_similarity, XPU, ALL_LAYOUT, ops::XPUIOUSimilarityKernel, float) {} #endif diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index 82b437b943cb4..6594df2f5164f 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FusedGemmEpilogueXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -102,7 +102,7 @@ class FusedGemmEpilogueXPUKernel : public framework::OpKernel { } }; -template +template class FusedGemmEpilogueXPUGradKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -227,15 +227,17 @@ class FusedGemmEpilogueXPUGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; - -REGISTER_OP_XPU_KERNEL( - fused_gemm_epilogue, - ops::FusedGemmEpilogueXPUKernel, - ops::FusedGemmEpilogueXPUKernel); - -REGISTER_OP_XPU_KERNEL( - fused_gemm_epilogue_grad, - ops::FusedGemmEpilogueXPUGradKernel, - ops::FusedGemmEpilogueXPUGradKernel); +namespace plat = paddle::platform; + +PD_REGISTER_STRUCT_KERNEL(fused_gemm_epilogue, + XPU, + ALL_LAYOUT, + ops::FusedGemmEpilogueXPUKernel, + float, + plat::float16) {} +PD_REGISTER_STRUCT_KERNEL(fused_gemm_epilogue_grad, + XPU, + ALL_LAYOUT, + ops::FusedGemmEpilogueXPUGradKernel, + float, + plat::float16) {} diff --git a/paddle/fluid/operators/load_combine_op_xpu.cc b/paddle/fluid/operators/load_combine_op_xpu.cc index 307c9042c543d..d285af37cda98 100644 --- a/paddle/fluid/operators/load_combine_op_xpu.cc +++ b/paddle/fluid/operators/load_combine_op_xpu.cc @@ -15,11 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/load_combine_op.h" namespace ops = paddle::operators; -using XPUCtx = paddle::platform::XPUDeviceContext; - -REGISTER_OP_XPU_KERNEL(load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); +PD_REGISTER_STRUCT_KERNEL(load_combine, + XPU, + ALL_LAYOUT, + ops::LoadCombineOpKernel, + float, + double, + int, + int8_t, + int64_t) {} diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index aa5c6bc249cda..ae464e7b47161 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -259,13 +259,15 @@ PD_REGISTER_STRUCT_KERNEL(lod_reset, int64_t) {} #ifdef PADDLE_WITH_XPU -using XPUCtx = paddle::platform::XPUDeviceContext; -REGISTER_OP_XPU_KERNEL(lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel); +PD_REGISTER_STRUCT_KERNEL(lod_reset, + XPU, + ALL_LAYOUT, + ops::LoDResetKernel, + plat::float16, + float, + double, + int, + int64_t) {} #endif PD_REGISTER_STRUCT_KERNEL(lod_reset_grad, diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index 87e6d42e98ad5..6c0c7f30d8e49 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class LogLossXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -37,7 +37,7 @@ class LogLossXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_loss"); } }; -template +template class LogLossGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -67,10 +67,9 @@ class LogLossGradXPUKernel : public framework::OpKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - log_loss, ops::LogLossXPUKernel); -REGISTER_OP_XPU_KERNEL( - log_loss_grad, - ops::LogLossGradXPUKernel); +PD_REGISTER_STRUCT_KERNEL( + log_loss, XPU, ALL_LAYOUT, ops::LogLossXPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + log_loss_grad, XPU, ALL_LAYOUT, ops::LogLossGradXPUKernel, float) {} #endif diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 737228902b6e7..0ac30b3e87347 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class AccuracyXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -73,8 +73,6 @@ class AccuracyXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - accuracy, - ops::AccuracyXPUKernel); - +PD_REGISTER_STRUCT_KERNEL( + accuracy, XPU, ALL_LAYOUT, ops::AccuracyXPUKernel, float) {} #endif diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc index 267e064b3065e..52b57252b0abe 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class LarsMomentumOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -115,7 +115,11 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(lars_momentum, - ops::LarsMomentumOpXPUKernel, - ops::LarsMomentumOpXPUKernel); +namespace plat = paddle::platform; +PD_REGISTER_STRUCT_KERNEL(lars_momentum, + XPU, + ALL_LAYOUT, + ops::LarsMomentumOpXPUKernel, + float, + plat::float16) {} #endif diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index e250b5585da06..b23fee1a012df 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -70,6 +70,10 @@ class XPULogsumexpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +// This kernel can not be registered in phi, because op logsumexp should run +// phi::LogsumexpKernel rather than XPULogsumexpKernel here. And if register +// xpu logsumexp kernel in phi, op logsumexp will run XPULogsumexpKernel here +// and raise error. 
REGISTER_OP_XPU_KERNEL( logsumexp, ops::XPULogsumexpKernel); From 2e1ac529c67ba5a090230749683eb845bfb3afce Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:34:39 +0800 Subject: [PATCH 094/405] [XPU] remove scale_loss in parallel.py (#53337) * [XPU] remove scale_loss in parallel.py * [XPU] throw Unimplemented when using Reducer --- paddle/fluid/imperative/reducer.cc | 4 ++- .../allocation/naive_best_fit_allocator.cc | 4 +-- .../fluid/operators/reader/buffered_reader.cc | 2 +- .../platform/device/xpu/xpu_resource_pool.cc | 8 ++--- paddle/phi/kernels/xpu/bmm_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/bmm_kernel.cc | 2 +- paddle/phi/kernels/xpu/xpu_api_wrapper.h | 6 ++-- python/paddle/distributed/parallel.py | 17 +--------- .../fluid/dygraph/tensor_patch_methods.py | 33 +++++-------------- 9 files changed, 24 insertions(+), 54 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index f90e1243d0f8d..33fbfc1d33746 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -61,7 +61,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { VLOG(4) << "after div 2" << *tensor; } else if (platform::is_xpu_place(tensor->place())) { #ifdef PADDLE_WITH_XPU_BKCL -// TODO(liuyuhui) support xpu about div nranks in the future + PADDLE_THROW( + platform::errors::Unimplemented("DivNRanks is not supported on XPU / " + "XPU_BKCL, use EagerReducer instead.")); #endif } } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 8c9eb889add6c..29ac488e9c52b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -149,7 +149,7 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = nullptr; - platform::XPUDeviceGuard gurad(place.device); + platform::XPUDeviceGuard guard(place.device); int ret = xpu_malloc(reinterpret_cast(&p), size); if (ret != XPU_SUCCESS) { VLOG(10) << "xpu memory malloc(" << size << ") failed, try again"; @@ -182,7 +182,7 @@ void Free(const platform::XPUPlace &place, VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - platform::XPUDeviceGuard gurad(place.device); + platform::XPUDeviceGuard guard(place.device); xpu_free(p); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index f0f54eafaa36b..b1e29ea69249d 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -268,7 +268,7 @@ void BufferedReader::ReadAsync(size_t i) { xpu_ptrs.emplace_back(xpu[i].mutable_data(place_, cpu[i].type())); } - platform::XPUDeviceGuard gurad(place_.device); + platform::XPUDeviceGuard guard(place_.device); int r = xpu_event_record(events_[i].get(), compute_stream_); PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_event_record"); r = xpu_stream_wait_event(stream_.get(), events_[i].get()); diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc index dccb8a9f295d6..5b6cbac603e5d 100644 --- a/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc @@ -22,14 
+22,14 @@ XpuStreamResourcePool::XpuStreamResourcePool() { pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { - platform::XPUDeviceGuard gurad(dev_idx); + platform::XPUDeviceGuard guard(dev_idx); xpuStream stream; xpu_stream_create(&stream); return stream; }; auto deleter = [dev_idx](xpuStream stream) { - platform::XPUDeviceGuard gurad(dev_idx); + platform::XPUDeviceGuard guard(dev_idx); xpu_stream_destroy(stream); }; @@ -63,14 +63,14 @@ XpuEventResourcePool::XpuEventResourcePool() { pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { - platform::XPUDeviceGuard gurad(dev_idx); + platform::XPUDeviceGuard guard(dev_idx); xpuEventHandle event; xpu_event_create(&event); return event; }; auto deleter = [dev_idx](xpuEventHandle event) { - platform::XPUDeviceGuard gurad(dev_idx); + platform::XPUDeviceGuard guard(dev_idx); xpu_event_destroy(event); }; diff --git a/paddle/phi/kernels/xpu/bmm_grad_kernel.cc b/paddle/phi/kernels/xpu/bmm_grad_kernel.cc index 5f4a0d9a99d39..4c3b3dcd2c9b0 100644 --- a/paddle/phi/kernels/xpu/bmm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/bmm_grad_kernel.cc @@ -33,7 +33,7 @@ void MatMul(const Context& dev_ctx, MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); - } else if (fccal_type == XPUFCCalcType::FC_INT_WITH_LL) { + } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); } else { MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); diff --git a/paddle/phi/kernels/xpu/bmm_kernel.cc b/paddle/phi/kernels/xpu/bmm_kernel.cc index b68a5fc3c0076..04afc3ef1b007 100644 --- a/paddle/phi/kernels/xpu/bmm_kernel.cc +++ b/paddle/phi/kernels/xpu/bmm_kernel.cc @@ -68,7 +68,7 @@ void BmmKernel(const Context& dev_ctx, MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); - } else if (fccal_type == XPUFCCalcType::FC_INT_WITH_LL) { + } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); } else { MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); diff --git a/paddle/phi/kernels/xpu/xpu_api_wrapper.h b/paddle/phi/kernels/xpu/xpu_api_wrapper.h index 5bbe1163552ca..b75eaa1589323 100644 --- a/paddle/phi/kernels/xpu/xpu_api_wrapper.h +++ b/paddle/phi/kernels/xpu/xpu_api_wrapper.h @@ -30,7 +30,7 @@ enum XPUFCCalcType { FC_INT16 = 0, FC_INT32, FC_FLOAT, - FC_INT_WITH_LL, + FC_INT32_WITH_LL, }; template @@ -42,8 +42,8 @@ XPUFCCalcType FCCalcType() { return XPUFCCalcType::FC_INT32; } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { return XPUFCCalcType::FC_FLOAT; - } else if (std::getenv("XPU_PADDLE_FC_INT_WITH_LL") != nullptr) { - return XPUFCCalcType::FC_INT_WITH_LL; + } else if (std::getenv("XPU_PADDLE_FC_INT32_WITH_LL") != nullptr) { + return XPUFCCalcType::FC_INT32_WITH_LL; } return XPUFCCalcType::FC_INT16; } diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 3fdf7cdcdd954..30a3e82eaf1b2 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -47,7 +47,7 @@ # (TODO: GhostScreaming) It will be removed later. 
from paddle.framework import _set_expected_place from paddle.framework import base as imperative_base -from paddle.framework import core, in_dygraph_mode, to_variable +from paddle.framework import core, in_dygraph_mode from paddle.nn.layer import layers from paddle.utils import deprecated @@ -117,21 +117,6 @@ def _split_tensors(coalesced_grads_and_grad_vars): assert g_var.shape == g_shape -def scale_loss(loss): - # TODO(liuyuhui) Currently only for xpu. Will be removed in the future. - if not paddle.distributed.ParallelEnv().world_size > 1: - return loss - - loss_scale = to_variable( - np.array([paddle.distributed.ParallelEnv().world_size]).astype( - "float32" - ) - ) - loss_scale.stop_gradient = True - scaled_loss = loss / loss_scale - return scaled_loss - - @imperative_base.no_grad @framework.dygraph_only def build_groups(vars, group_size): diff --git a/python/paddle/fluid/dygraph/tensor_patch_methods.py b/python/paddle/fluid/dygraph/tensor_patch_methods.py index 882a333b5ebf7..90d896f7c7dc1 100644 --- a/python/paddle/fluid/dygraph/tensor_patch_methods.py +++ b/python/paddle/fluid/dygraph/tensor_patch_methods.py @@ -275,8 +275,6 @@ def backward(self, grad_tensor=None, retain_graph=False): # 4: [5000.] """ - from paddle.distributed.parallel import scale_loss - if framework._non_static_mode(): if in_profiler_mode(): record_event = profiler.RecordEvent( @@ -306,30 +304,15 @@ def backward(self, grad_tensor=None, retain_graph=False): if _grad_scalar: # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. self = _grad_scalar.scale(self) - if paddle.is_compiled_with_xpu(): - # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. - scaled_loss = scale_loss(self) - if framework.global_var._in_eager_mode_: - core.eager.run_backward( - [scaled_loss], grad_tensor, retain_graph - ) - else: - core.dygraph_run_backward( - [scaled_loss], - [grad_tensor], - retain_graph, - framework._dygraph_tracer(), - ) + if framework.global_var._in_eager_mode_: + core.eager.run_backward([self], grad_tensor, retain_graph) else: - if framework.global_var._in_eager_mode_: - core.eager.run_backward([self], grad_tensor, retain_graph) - else: - core.dygraph_run_backward( - [self], - [grad_tensor], - retain_graph, - framework._dygraph_tracer(), - ) + core.dygraph_run_backward( + [self], + [grad_tensor], + retain_graph, + framework._dygraph_tracer(), + ) if in_profiler_mode(): record_event.end() else: From 0fac328106f3d7062eceafc48bcc1e8ae92c464b Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:45:15 +0800 Subject: [PATCH 095/405] remove some [-Wunused-parameter] warning (#53365) * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop * test,test=develop --- paddle/fluid/distributed/collective/process_group.h | 2 +- paddle/fluid/distributed/ps/service/brpc_ps_client.h | 4 ++-- paddle/fluid/eager/to_static/run_program_op_node.h | 8 ++++---- paddle/fluid/framework/data_set.h | 2 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 6 +++--- paddle/phi/kernels/funcs/activation_functor.h | 4 ++-- paddle/phi/kernels/funcs/compound_functors.h | 8 +++++--- paddle/phi/kernels/funcs/detail/activation_functions.h | 4 ++-- paddle/phi/kernels/funcs/gather.h | 6 +++--- paddle/phi/kernels/funcs/jit/helper.h | 2 +- paddle/phi/kernels/funcs/reduce_functor.h | 2 +- 
paddle/phi/kernels/funcs/scatter.h | 2 +- paddle/phi/kernels/funcs/strided_memcpy.h | 6 +++--- paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_kernel.cc | 4 ++-- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- 18 files changed, 35 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/distributed/collective/process_group.h b/paddle/fluid/distributed/collective/process_group.h index 447fc5d1b3c7b..eff17c9d4e061 100644 --- a/paddle/fluid/distributed/collective/process_group.h +++ b/paddle/fluid/distributed/collective/process_group.h @@ -486,7 +486,7 @@ class ProcessGroup { virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT - const ReduceOptions& opts) { + const ReduceOptions& opts UNUSED) { PADDLE_THROW(phi::errors::InvalidArgument( "ProcessGroup%s does not support reduce", GetBackendName())); } diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index bbaecc498a80a..d902824bfd60c 100755 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -30,7 +30,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" - +#include "paddle/phi/core/macros.h" namespace brpc { class Channel; class Controller; @@ -63,7 +63,7 @@ class DownpourPsClientService : public PsService { PsResponseMessage *response, ::google::protobuf::Closure *done); - virtual void FLService(::google::protobuf::RpcController *controller, + virtual void FLService(::google::protobuf::RpcController *controller UNUSED, const CoordinatorReqMessage *request, CoordinatorResMessage *response, ::google::protobuf::Closure *done) { diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 6b3c73dbdb39b..57defbaee4dca 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -457,8 +457,8 @@ inline void RunProgramAPI( } inline void RunProgramGradAPI( - const std::vector &x, - const std::vector ¶ms, + const std::vector &x UNUSED, + const std::vector ¶ms UNUSED, const std::vector &out_grad, const std::vector &step_scope, // NOLINT const paddle::framework::AttributeMap &attrs, @@ -610,8 +610,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { egr::kSlotSmallVectorSize> operator()(paddle::small_vector, egr::kSlotSmallVectorSize> &grads, // NOLINT - bool create_graph, - bool is_new_grad) override { + bool create_graph UNUSED, + bool is_new_grad UNUSED) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; paddle::small_vector, egr::kSlotSmallVectorSize> hooked_grads = GradNodeRunProgram::ApplyGradientHooks(grads); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 1bc60993e36a0..9af5fbfc6b4a5 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -58,7 +58,7 @@ class Dataset { const uint16_t start_sample_layer UNUSED, const bool with_hierachy UNUSED, const uint16_t seed_ UNUSED, - const uint16_t sample_slot) {} + const uint16_t sample_slot UNUSED) {} // set file list virtual void SetFileList(const std::vector& filelist) = 0; // set readers' num diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 2ba103ce0fcae..a5274c5f7ae7c 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -44,7 +44,7 @@ class FCMKLDNNHandler const phi::DenseTensor* x, const phi::DenseTensor* weights, const phi::DenseTensor* bias, - phi::DenseTensor* out, + phi::DenseTensor* out UNUSED, const int in_num_col_dims, dnnl::engine onednn_engine, platform::Place cpu_place) diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 0877a8e24468c..8df5e9a543eb2 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -30,7 +30,7 @@ void EmptyKernel(const Context& dev_ctx, template void EmptyLikeKernel(const Context& dev_ctx, - const DenseTensor& x, + const DenseTensor& x UNUSED, DataType dtype UNUSED, DenseTensor* out) { dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 939e270613650..67d56f1c46aa3 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -25,8 +25,8 @@ namespace phi { template void FlattenInferKernel(const Context& dev_ctx, const DenseTensor& x, - int start_axis, - int stop_axis, + int start_axis UNUSED, + int stop_axis UNUSED, DenseTensor* out) { dev_ctx.Alloc(out, x.dtype()); auto out_dims = out->dims(); @@ -43,7 +43,7 @@ void FlattenKernel(const Context& dev_ctx, int start_axis, int stop_axis, DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* xshape UNUSED) { FlattenInferKernel(dev_ctx, x, start_axis, stop_axis, out); } diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 6a3554318e5e6..aefa653928633 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1198,7 +1198,7 @@ struct TanhGradFunctor : public BaseActivationFunctor { typename Out, typename dOut, typename dX> - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (static_cast(1) - out * out); } @@ -1794,7 +1794,7 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { typename Out, typename dOut, typename dX> - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * out * (static_cast(1) - out); } diff --git a/paddle/phi/kernels/funcs/compound_functors.h b/paddle/phi/kernels/funcs/compound_functors.h index 0fd3fd0e932fc..020c6dcd87f30 100644 --- a/paddle/phi/kernels/funcs/compound_functors.h +++ b/paddle/phi/kernels/funcs/compound_functors.h @@ -96,12 +96,12 @@ struct BinaryCompoundGradDyFunctor { unary_fun_(unary_fun), d_unary_fun_(d_unary_fun) {} - inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { + inline HOSTDEVICE T Recompute(T x, T y, T out UNUSED, T dout) { return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_.UseX(y); } inline HOSTDEVICE T - UseIntermediateOut(T x, T y, T intermediate_out, T out, T dout) { + UseIntermediateOut(T x, T y, T intermediate_out, T out UNUSED, T dout) { if (InPlace) { return dout * d_binary_fun_.Dy(x, intermediate_out) * d_unary_fun_.UseOut(intermediate_out); @@ -111,7 +111,9 @@ struct BinaryCompoundGradDyFunctor { } } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); } + inline HOSTDEVICE T GetIntermediateOut(T x UNUSED, T y) { + return unary_fun_(y); + } private: DBinaryFun d_binary_fun_; diff --git a/paddle/phi/kernels/funcs/detail/activation_functions.h 
b/paddle/phi/kernels/funcs/detail/activation_functions.h index 26be2a83280c3..f1352df226094 100644 --- a/paddle/phi/kernels/funcs/detail/activation_functions.h +++ b/paddle/phi/kernels/funcs/detail/activation_functions.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/hostdevice.h" - +#include "paddle/phi/core/macros.h" namespace phi { namespace funcs { namespace detail { @@ -104,7 +104,7 @@ DEVICE T TanhV2(const T a) { namespace backward { template -DEVICE T Identity(const T a, const T b) { +DEVICE T Identity(const T a, const T b UNUSED) { return a; } diff --git a/paddle/phi/kernels/funcs/gather.h b/paddle/phi/kernels/funcs/gather.h index f1ab1a16f1224..50f7f4fa0322c 100644 --- a/paddle/phi/kernels/funcs/gather.h +++ b/paddle/phi/kernels/funcs/gather.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/math_function.h" - namespace phi { namespace funcs { @@ -34,7 +34,7 @@ namespace funcs { * return: output tensor */ template -void CPUGather(const phi::CPUContext& ctx, +void CPUGather(const phi::CPUContext& ctx UNUSED, const DenseTensor& src, const DenseTensor& index, DenseTensor* output) { @@ -95,7 +95,7 @@ void CPUGather(const phi::CPUContext& ctx, } template -void CPUGatherNd(const phi::CPUContext& ctx, +void CPUGatherNd(const phi::CPUContext& ctx UNUSED, const DenseTensor& input, const DenseTensor& index, DenseTensor* output) { diff --git a/paddle/phi/kernels/funcs/jit/helper.h b/paddle/phi/kernels/funcs/jit/helper.h index ff233710a5cf3..7e3394dffd4a2 100644 --- a/paddle/phi/kernels/funcs/jit/helper.h +++ b/paddle/phi/kernels/funcs/jit/helper.h @@ -74,7 +74,7 @@ inline typename std::enable_if< !std::is_same::value || !std::is_same::value, const Kernel*>::type -GetJitCode(const typename KernelTuple::attr_type& attr) { +GetJitCode(const typename KernelTuple::attr_type& attr UNUSED) { return nullptr; } diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index f1d3772cc25a5..596be9bc33058 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -153,7 +153,7 @@ struct ProdGradFunctor { DX* dx, DY* dy, const Dim& dim, - int size) { + int size UNUSED) { dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); } }; diff --git a/paddle/phi/kernels/funcs/scatter.h b/paddle/phi/kernels/funcs/scatter.h index 7c23a35072cb8..d430588541837 100644 --- a/paddle/phi/kernels/funcs/scatter.h +++ b/paddle/phi/kernels/funcs/scatter.h @@ -47,7 +47,7 @@ elementwise_inner_add(const phi::CPUContext& ctx, template typename std::enable_if::value>::type -elementwise_inner_add(const phi::CPUContext& ctx, +elementwise_inner_add(const phi::CPUContext& ctx UNUSED, const T* src_pointer, T* dst_pointer, size_t src_index, diff --git a/paddle/phi/kernels/funcs/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h index d80cf7b2e2421..19a542bd2b5e9 100644 --- a/paddle/phi/kernels/funcs/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/strided_memcpy.h @@ -12,9 +12,9 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/phi/kernels/funcs/detail/strided_memcpy.h" - #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/macros.h" +#include "paddle/phi/kernels/funcs/detail/strided_memcpy.h" namespace phi { class CPUContext; @@ -65,7 +65,7 @@ inline void CopyWithContext(const Context& ctx, } template <> -inline void CopyWithContext(const phi::CPUContext& ctx, +inline void CopyWithContext(const phi::CPUContext& ctx UNUSED, const Place& dst_place, void* dst, const Place& src_place, diff --git a/paddle/phi/kernels/squeeze_grad_kernel.cc b/paddle/phi/kernels/squeeze_grad_kernel.cc index 0fe2f79073430..473acf9d7a1d1 100644 --- a/paddle/phi/kernels/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/squeeze_grad_kernel.cc @@ -23,7 +23,7 @@ template void SqueezeGradKernel(const Context& dev_ctx, const DenseTensor& xshape, const DenseTensor& dout, - const IntArray& axes, + const IntArray& axes UNUSED, DenseTensor* dx) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); diff --git a/paddle/phi/kernels/squeeze_kernel.cc b/paddle/phi/kernels/squeeze_kernel.cc index a0b72381601d6..d495b040921b5 100644 --- a/paddle/phi/kernels/squeeze_kernel.cc +++ b/paddle/phi/kernels/squeeze_kernel.cc @@ -23,7 +23,7 @@ namespace phi { template void SqueezeInferKernel(const Context& dev_ctx, const DenseTensor& x, - const IntArray& axes, + const IntArray& axes UNUSED, DenseTensor* out) { auto out_dims = out->dims(); dev_ctx.template Alloc(out); @@ -39,7 +39,7 @@ void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* xshape UNUSED) { SqueezeInferKernel(dev_ctx, x, axes, out); } diff --git a/paddle/phi/kernels/unsqueeze_kernel.cc b/paddle/phi/kernels/unsqueeze_kernel.cc index 4354b09c753b1..c08c31da4ef0c 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_kernel.cc @@ -44,7 +44,7 @@ void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* xshape UNUSED) { UnsqueezeInferKernel(dev_ctx, x, axes, out); } } // namespace phi From c50f5fa4e32b22acd3bf3e89ca0968ee65d0c5ea Mon Sep 17 00:00:00 2001 From: engineer1109 Date: Thu, 27 Apr 2023 10:50:24 +0800 Subject: [PATCH 096/405] fix softmax assert error (#53360) --- paddle/phi/kernels/funcs/softmax_impl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/funcs/softmax_impl.h b/paddle/phi/kernels/funcs/softmax_impl.h index 330ac331b6b8e..8f6b0fdd32c4b 100644 --- a/paddle/phi/kernels/funcs/softmax_impl.h +++ b/paddle/phi/kernels/funcs/softmax_impl.h @@ -82,12 +82,12 @@ class SoftmaxEigen { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_classes) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } From 2c12abd7fe3f3329b920afbb222df7fad787f5b6 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 27 Apr 2023 10:56:43 +0800 Subject: [PATCH 097/405] revert pr https://github.com/PaddlePaddle/Paddle/pull/46779 
(#53373) --- paddle/fluid/inference/tensorrt/op_teller.cc | 10 ---------- test/ir/inference/test_trt_convert_gather.py | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 78e300a8d730d..bb0fbdf6ca848 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -542,16 +542,6 @@ struct SimpleOpTypeSetTeller : public Teller { "the pass."; return false; } - - auto index_var_name = desc.Input("Index")[0]; - auto* index_var_desc = block->FindVar(index_var_name); - - // The index input must be int32 datatype. - if (index_var_desc->GetDataType() != - paddle::framework::proto::VarType_Type::VarType_Type_INT32) { - VLOG(3) << "gather op Index input data type must be int32"; - return false; - } #if !IS_TRT_VERSION_GE(7000) auto* x_var_desc = block->FindVar(desc.Input("X")[0]); const auto x_shape = x_var_desc->GetShape(); diff --git a/test/ir/inference/test_trt_convert_gather.py b/test/ir/inference/test_trt_convert_gather.py index 3c25dd6eff1c9..69a2624b77e09 100644 --- a/test/ir/inference/test_trt_convert_gather.py +++ b/test/ir/inference/test_trt_convert_gather.py @@ -182,7 +182,7 @@ def generate_trt_nodes_num(dynamic_shape): if self.input_num == 3: return 0, 5 else: - if dynamic_shape and self.index_type_int32: + if dynamic_shape: return 1, 3 else: return 0, 4 From 1bd468e24dae9ae1b2793253232bc7a78ec840ef Mon Sep 17 00:00:00 2001 From: JYChen Date: Thu, 27 Apr 2023 11:00:26 +0800 Subject: [PATCH 098/405] Hack__getitem__ from 0-d to 1-d with FLAGS_set_to_1d (#53358) --- paddle/fluid/pybind/eager_method.cc | 16 +++++++++++++--- paddle/fluid/pybind/imperative.cc | 13 +++++++++++++ paddle/phi/kernels/funcs/slice_utils.h | 23 +++++++++++++++++++++-- python/paddle/fft.py | 4 ++-- python/paddle/fluid/variable_index.py | 7 +++++++ 5 files changed, 56 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 1da7bd774531b..28a86b8db8a3c 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -136,17 +136,18 @@ static PyObject* tensor_method_numpy(TensorObject* self, } } if (set_to_1d) { - // 0D Tensor hack process to 1D numpy, will remove in future + // 0D Tensor hack process to 1D numpy, will remove in release 2.6 VLOG(0) << "Warning:: 0D Tensor cannot be used as 'Tensor.numpy()[0]' . In " "order to avoid this problem, " "0D Tensor will be changed to 1D numpy currently, but it's not " "correct and will be " - "removed in future. For Tensor contain only one element, Please " + "removed in release 2.6. For Tensor contain only one element, " + "Please " "modify " " 'Tensor.numpy()[0]' to 'float(Tensor)' as soon as " "possible, " - "otherwise 'Tensor.numpy()[0]' will raise error in future."; + "otherwise 'Tensor.numpy()[0]' will raise error in release 2.6."; py_rank = 1; py_dims[0] = 1; py_strides[0] = sizeof_dtype * numel; @@ -923,7 +924,16 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, } } + bool set_to_1d = FLAGS_set_to_1d; if (!none_axes.empty()) { + if (set_to_1d) { + // NOTE(zoooo0820): When all axes are decreased, the output will be 1-D + // with FLAGS_set_to_1d=True. In this case, one `None` should be pop out, + // otherwise the output shape will be not correct. 
+ if (static_cast(decrease_axis.size()) == tensor->dims().size()) { + none_axes.pop_back(); + } + } if (!none_axes.empty()) { paddle::Tensor new_out; { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 39a28c7487c30..372fae12ec3bc 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -63,6 +63,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/type_defs.h" +PHI_DECLARE_bool(set_to_1d); namespace paddle { namespace pybind { @@ -1064,7 +1065,19 @@ void BindImperative(py::module *m_ptr) { } tracer->TraceOp(op_type, ins, outs, std::move(attrs)); } + + bool set_to_1d = FLAGS_set_to_1d; if (!none_axes.empty()) { + if (set_to_1d) { + // NOTE(zoooo0820): When all axes are decreased, the output + // will be 1-D with FLAGS_set_to_1d=True. In this case, one + // `None` should be pop out, otherwise the output shape will be + // not correct. + if (static_cast(decrease_axis.size()) == + tensor->dims().size()) { + none_axes.pop_back(); + } + } if (!none_axes.empty()) { // Deal with cases that decrease_axes is not empty // For example: diff --git a/paddle/phi/kernels/funcs/slice_utils.h b/paddle/phi/kernels/funcs/slice_utils.h index 9bbb7681dd888..81a35b6774ae2 100644 --- a/paddle/phi/kernels/funcs/slice_utils.h +++ b/paddle/phi/kernels/funcs/slice_utils.h @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include - #include #include +#include "paddle/phi/core/flags.h" + +PHI_DECLARE_bool(set_to_1d); namespace phi { @@ -202,7 +205,23 @@ inline DDim GetDecreasedDims(const DDim slice_dims, new_shape.push_back(decreased_dims[i]); } } - + if (FLAGS_set_to_1d && new_shape.size() == 0) { + // NOTE(zoooo0820): Hack procssing to 1-D, when axes decrease to 0-D in + // slice. This will remove in release 2.6. + VLOG(0) + << "Warning:: In Tensor '__getitem__', if the number of scalar " + "elements " + "in the index is equal to the rank of the Tensor, the output " + "should " + "be 0-D. In order to be consistent with the behavior of previous " + "versions, it will be processed to 1-D. But it is not correct and " + "will be " + "removed in release 2.6. " + "If 1-D is still wanted, please modify the index element from " + "scalar to slice " + "(e.g. 'x[i]' => 'x[i:i+1]'). 
"; + new_shape.push_back(1); + } decreased_dims = phi::make_ddim(new_shape); } return decreased_dims; diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 48c20f7fdafaf..438c65ae2f044 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -1371,7 +1371,7 @@ def fftshift(x, axes=None, name=None): elif isinstance(axes, int): shifts = shape[axes] // 2 else: - shifts = paddle.stack([shape[ax] // 2 for ax in axes]) + shifts = paddle.concat([shape[ax : ax + 1] // 2 for ax in axes]) return paddle.roll(x, shifts, axes, name=name) @@ -1416,7 +1416,7 @@ def ifftshift(x, axes=None, name=None): elif isinstance(axes, int): shifts = -shape[axes] // 2 else: - shifts = paddle.stack([-shape[ax] // 2 for ax in axes]) + shifts = paddle.concat([-shape[ax : ax + 1] // 2 for ax in axes]) return paddle.roll(x, shifts, axes, name=name) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index fe7833f048f04..86ba9d5d58dcd 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -574,6 +574,13 @@ def _getitem_impl_(var, item): out = reverse(out, axis=reverse_axes) + # NOTE(zoooo0820): When all axes are decreased, the output will be 1-D + # with FLAGS_set_to_1d=True. In this case, one `None` should be pop out, + # otherwise the output shape will be not correct. + set_to_1d = paddle.get_flags('FLAGS_set_to_1d')['FLAGS_set_to_1d'] + if set_to_1d and len(decrease_axes) == len(var.shape): + none_axes = none_axes[1:] + if len(none_axes) > 0: # Deal with cases that decrease_axes is not empty # For example: From 35af5818afe85daf16c02fc4e79749236cce0fb8 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 27 Apr 2023 11:01:52 +0800 Subject: [PATCH 099/405] refine SynchronizeAllDevice (#53370) --- paddle/fluid/platform/profiler_helper.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 0f058ad0f8df8..1d34d5fd27b3e 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -95,28 +95,34 @@ std::vector> GetMemEvents() { void SynchronizeAllDevice() { #ifdef PADDLE_WITH_CUDA + int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } + SetDeviceId(pre_device_id); #endif #ifdef PADDLE_WITH_HIP + int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } + SetDeviceId(pre_device_id); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto &dev_type : dev_types) { + int pre_device_id = phi::DeviceManager::GetDevice(dev_type); auto dev_cnt = phi::DeviceManager::GetDeviceCount(dev_type); for (size_t i = 0; i < dev_cnt; i++) { auto place = paddle::platform::CustomPlace(dev_type, i); phi::DeviceManager::SetDevice(place); phi::DeviceManager::SynchronizeDevice(place); } + phi::DeviceManager::SetDevice(dev_type, pre_device_id); } #endif } From c0ee14f6767cda4f38566c2f23eb364e3354c935 Mon Sep 17 00:00:00 2001 From: hua-zi <83271073+hua-zi@users.noreply.github.com> Date: Thu, 27 Apr 2023 11:12:24 +0800 Subject: [PATCH 100/405] updata Adamw.py (#52984) * updata Adamw.py out.backward() -> loss.backward() * Update adamw.py --- python/paddle/optimizer/adamw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index a525ac194a12c..f8e00eabecf5e 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -90,7 +90,7 @@ class AdamW(Optimizer): name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. - **Notes**: + Notes: **Currently, AdamW doesn't support sparse parameter optimization.** Examples: @@ -111,7 +111,7 @@ class AdamW(Optimizer): beta1=beta1, beta2=beta2, weight_decay=0.01) - out.backward() + loss.backward() opt.step() opt.clear_grad() @@ -135,7 +135,7 @@ class AdamW(Optimizer): }], weight_decay=0.01, beta1=0.9) - out.backward() + loss.backward() opt.step() opt.clear_grad() From 6768c6ecd8c30f3dd3cdf7e8628693ed9bb2c636 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 27 Apr 2023 11:29:02 +0800 Subject: [PATCH 101/405] =?UTF-8?q?=E3=80=90prim=E3=80=91Concat=20bug=20(#?= =?UTF-8?q?53350)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify concat_grad add sum comp rule * modify opcompat --- .../prim/api/composite_backward/composite_backward_api.h | 2 +- paddle/phi/api/yaml/op_compat.yaml | 6 ++++++ python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++- python/paddle/fluid/tests/unittests/test_sum_op.py | 6 ++++-- python/paddle/incubate/autograd/composite_rules.py | 8 ++++++++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 9790f36ec590a..4784f2fb61751 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -470,7 +470,7 @@ void concat_grad(const std::vector& x, sections.push_back(x[i].dims()[axis_value]); } std::vector x_grad_tmp = - split(out_grad, phi::IntArray(sections), axis); + split(out_grad, phi::IntArray(sections), axis_value); for (int i = 0; i < x_num; ++i) { set_output(x_grad_tmp.at(i), x_grad.at(i)); } diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 8d29a77337320..68778a1c85602 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -95,6 +95,12 @@ attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] +- op : add_n (sum) + inputs: + {inputs : X} + outputs: + {out : Out} + - op : addmm backward : addmm_grad inputs : diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 670a48faf5c9b..a383e1b0c0624 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1113,7 +1113,8 @@ set(TEST_CINN_OPS test_dropout_op test_group_norm_op test_tile_op - test_roll_op) + test_roll_op + test_sum_op) foreach(TEST_CINN_OPS ${TEST_CINN_OPS}) if(WITH_CINN) diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 0f006eab26ff4..99406f4599c83 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -45,6 +45,8 @@ 
class TestSumOp(OpTest): def setUp(self): self.op_type = "sum" self.python_api = sum_wrapper + self.public_python_api = paddle.add_n + self.prim_op_type = "comp" self.init_kernel_type() self.use_mkldnn = False self.init_kernel_type() @@ -60,10 +62,10 @@ def init_kernel_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output() + self.check_output(check_prim=True) def test_check_grad(self): - self.check_grad(['x0'], 'Out') + self.check_grad(['x0'], 'Out', check_prim=True) class TestSelectedRowsSumOp(unittest.TestCase): diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py index 3a1a3ea7d6751..0a47715c6062c 100644 --- a/python/paddle/incubate/autograd/composite_rules.py +++ b/python/paddle/incubate/autograd/composite_rules.py @@ -679,6 +679,14 @@ def group_norm_composite(x, scale, bias, epsilon, groups, data_layout): return out, ret_mean_, ret_var_ +@REGISTER_COMPOSITE('sum') +def sum_composite(x): + ans = 0 + for xi in x: + ans += xi + return ans + + @REGISTER_COMPOSITE('leaky_relu') def leaky_relu_composite(x, negative_slope): """define composite rule of op leaky_relu.""" From e8d296efecd95ecccd3fb71c2dbcd70245119b7d Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Thu, 27 Apr 2023 11:36:28 +0800 Subject: [PATCH 102/405] Add jacobian and hessian (#53331) * add jacobian and hessian in paddle.autograd * disable unitest 'func_multi_input' for bug in high-order gradient of multiply * add dimension checks * add support for 0-D tensor * change return type from Jacobian to Hessian in hessian function * refine Jacobian _flatten function for single xs * refine support for 0-D tensor * 1. add 'func_multi_input' unitest for multiply_grad_kernel bug fixed already. 2. support non-inplace math operation via magical method overwriting. * add unitest for math operation and raise error when 0-D tensor is indexed * add ndim check on ys and xs according to is_batched, and add one unitest * refine docstring of jacobian and hessian * move paddle.incubate.autograd.Jacobian/Hessian to paddle.incubate.autograd.functional.Jacobian/Hessian * remove single_input unitest case because numerical differentiation is wrong * remove 3 unitest for numerical result(reference result) is wrong * 1. rename autodiff.py to autograd.py 2. increase TIMEOUT to 100 * cancel modification for functional Jacobian/Hessian * 1. use tuple as return type instead of list 2. refine docstring * add more unitest case to improve coverage * remove 2 unitest of Hessian for numerical result is wrong * remove 1 unitest of Hessian for numerical result is wrong * remove 1 unitest of Hessian for numerical result is wrong * change unit test to shape check * correct doc and replace incubate API to stable API in _grad --- python/paddle/autograd/__init__.py | 3 + python/paddle/autograd/autograd.py | 712 +++++++++++++++++++++++++ test/autograd/CMakeLists.txt | 1 + test/autograd/test_autograd_dynamic.py | 668 +++++++++++++++++++++++ 4 files changed, 1384 insertions(+) create mode 100644 python/paddle/autograd/autograd.py create mode 100644 test/autograd/test_autograd_dynamic.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 16dbef6ae44b4..40bcd036f184d 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,12 +18,15 @@ from ..fluid.dygraph.base import is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import set_grad_enabled # noqa: F401 from . 
import backward_mode # noqa: F401 +from .autograd import jacobian, hessian # noqa: F401 from .backward_mode import backward # noqa: F401 from .py_layer import PyLayer # noqa: F401 from .py_layer import PyLayerContext # noqa: F401 from .saved_tensors_hooks import saved_tensors_hooks __all__ = [ # noqa + 'jacobian', + 'hessian', 'backward', 'PyLayer', 'PyLayerContext', diff --git a/python/paddle/autograd/autograd.py b/python/paddle/autograd/autograd.py new file mode 100644 index 0000000000000..cfd5442138c3a --- /dev/null +++ b/python/paddle/autograd/autograd.py @@ -0,0 +1,712 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Sequence, Tuple, Union + +import paddle +from paddle.fluid import framework + + +def as_tensors(xs): + if isinstance(xs, framework.Variable): + return xs + elif isinstance(xs, Sequence): + return tuple(xs) + else: + return xs + + +class Jacobian: + r"""Computes the Jacobian matrix of given xs and ys. + + Once the Jacobian ``J`` is constructed, you can use a multidimensional index + to retrieve the submatrix of ``J``, as same as slicing a Tensor. The + submatrix is lazily evaluated along row axis, and will be cached once + evaluated. + + you can retrieve the submatrix by + following methods: + + * J[:], retrieving the full matrix. + * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input + variable. + * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output + variable. + * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output + variable and the j'th input variable. + + Notes: + + Eclipsis index is not supported currently. + + Args: + + ys (Tensor|Tuple[Tensor, ...]): The output derived from xs . + xs (Tensor|Tuple[Tensor, ...]): The input tensor(s) . + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Jacobian (Object): A python object retains the Jacobian matrix. 
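+
+    Examples:
+
+        A minimal sketch of direct use. The import path and the tensor
+        values below are illustrative assumptions; only the
+        ``Jacobian(ys, xs)`` call pattern and the indexing forms come from
+        this class.
+
+        .. code-block:: python
+
+            import paddle
+            # Assumed module path of this new file
+            # (python/paddle/autograd/autograd.py).
+            from paddle.autograd.autograd import Jacobian
+
+            x = paddle.rand([4])
+            x.stop_gradient = False
+            y = 2.0 * x              # ys and xs are both 1-D, not batched
+
+            J = Jacobian(y, x)       # rows are evaluated lazily and cached
+            print(J.shape)           # flattened shape, [4, 4] in this case
+            print(J[:])              # full matrix; 2 * identity for y = 2 * x
+            print(J[0, :])           # one row: derivatives of y[0] w.r.t. x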
+ + """ + + def __init__(self, ys, xs, is_batched=False): + if not is_batched: + if not 0 <= len(xs.shape) <= 1: + raise ValueError( + f"xs.ndim should be 0 or 1 when is_batched=False" + f" but got {len(xs.shape)}" + ) + if not 0 <= len(ys.shape) <= 1: + raise ValueError( + f"ys.ndim should be 0 or 1 when is_batched=False" + f" but got {len(ys.shape)}" + ) + self._jacobian = _JacobianNoBatch(ys, xs) + else: + if not 1 <= len(ys.shape) <= 2: + raise ValueError( + f"ys.ndim should be 1 or 2 when is_batched=True" + f" but got {len(ys.shape)}" + ) + if not 1 <= len(xs.shape) <= 2: + raise ValueError( + f"xs.ndim should be 1 or 2 when is_batched=True" + f" but got {len(xs.shape)}" + ) + self._jacobian = _JacobianBatchFirst(ys, xs) + + @property + def shape(self): + """The shape of flattened Jacobian matrix.""" + return self._jacobian.shape + + def __getitem__(self, indexes): + return self._jacobian[indexes] + + def __getattr__(self, __name: str): + if __name == "shape": + return getattr(self._jacobian, __name) + if __name == "_evaluate_all": + return getattr(self._jacobian, __name) + return getattr(self._jacobian._evaluate_all(), __name) + + def __add__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs + rhs + + def __sub__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs - rhs + + def __mul__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs * rhs + + def __div__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs / rhs + + def __truediv__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs / rhs + + def __pow__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs**rhs + + def __mod__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs % rhs + + def __floordiv__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs // rhs + + def __matmul__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs @ rhs + + def __eq__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs == rhs + + def __ne__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs != rhs + + def __lt__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs < rhs + + def __le__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs <= rhs + + def __gt__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs > rhs + + def __ge__(self, other): + lhs = self._evaluate_all() + rhs = other._evaluate_all() if isinstance(other, Jacobian) else other + return lhs >= rhs + + +class Hessian(Jacobian): + pass + + +class _Jacobian: + """The base class for computing Jacobian matrix. 
+ + ``_Jacobian`` implementes the core logic of multidimensional index and lazy + evaluation for Jacobian matrix, subclass only need to overwrite following + methods: + + * ``_lazy_axis()``, return the axis along which will be lazy + evaluating. + * ``_flatten(xs)``, flattens the inputs ``xs``. + * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . + + Notes: + + Because currently PaddlePaddle only support reverse differentiation by + ``paddle.grad``, so lazy evaluation is only supported along the row of + Jacobian matrix, which means that slicing along row will get better + performance. + + """ + + def __init__(self, ys, xs): + self.original_xs_shape = xs.shape + self.original_ys_shape = ys.shape + self._xs = xs + self._ys = ys + if len(self._ys.shape) == 0 and not self.is_batched: + self._ys = self._ys.reshape( + [ + -1, + ] + ) + if len(self._ys.shape) == 1 and self.is_batched: + self._ys = self._ys.reshape([-1, 1]) + + self._flatten_xs = self._flatten(as_tensors(self._xs)) + self._flatten_ys = self._flatten(as_tensors(self._ys)) + self._cache = {} + + @property + def _lazy_axis(self): + """ "The axis of lazily evaluated.""" + raise NotImplementedError + + def _lazy_indexes(self, indexes): + idx = indexes[self._lazy_axis] + return ( + (idx,) + if isinstance(idx, int) + else tuple(range(idx.start, idx.stop, idx.step)) + ) + + def _flatten(self, xs): + raise NotImplementedError + + def _shifted_indexes(self, indexes, lazy_axis_size=0): + idx = indexes[self._lazy_axis] + shifted_lazy_axis_idx = ( + 0 if isinstance(idx, int) else slice(0, lazy_axis_size, 1) + ) + return ( + indexes[: self._lazy_axis] + + (shifted_lazy_axis_idx,) + + indexes[self._lazy_axis + 1 :] + ) + + def __getitem__(self, indexes): + if self.is_batched is False: + if len(self.shape) == 0: + # xs and ys are both 0-D tensor + raise IndexError("0-D tensor can not be indexed.") + elif len(self.shape) == 1: + # either ys or xs is 0-D tensor + indexes = ( + (0, indexes) + if len(self.original_ys_shape) == 0 + else (indexes, 0) + ) + else: + if len(self.shape) == 1: + # xs and ys are both 1-D tensor + indexes = (indexes, 0, 0) + elif len(self.shape) == 2: + # either xs or ys is 1-D tensor + if isinstance(indexes, slice): + indexes = (indexes, slice(None, None, None)) + else: + indexes = ( + (indexes[0], 0, indexes[1]) + if len(self.original_ys_shape) == 1 + else (indexes[0], indexes[1], 0) + ) + + indexes = _multi_index(indexes, self.inner_shape) + + if isinstance(indexes[self._lazy_axis], int): + other_indexes = ( + indexes[: self._lazy_axis] + indexes[self._lazy_axis + 1 :] + ) + return self._cached_evaluate(indexes[self._lazy_axis])[ + other_indexes + ] + lazy_indexes = self._lazy_indexes(indexes) + # Using concat and reshape to replace stack operator temporarily, as + # it is not a primitive operator. 
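+        # Each requested row along the lazy axis is evaluated (and cached)
+        # one at a time; the per-row slices are then concatenated and
+        # reshaped back into the expected sub-matrix shape.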
+ shape = list(self.inner_shape) + shape[self._lazy_axis] = len(lazy_indexes) + part_jac = paddle.concat( + [self._cached_evaluate(i) for i in lazy_indexes], + axis=self._lazy_axis, + ).reshape(shape) + result = part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] + + # squeeze redundant 1 in shape + if len(result.shape) > len(self.shape): + for _ in range(len(result.shape) - len(self.shape)): + result = result.squeeze(-1) + + return result + + def _cached_evaluate(self, k): + if k is None: + return self._cached_evaluate(0).reshape([]) + v = self._cache.get(k) + if v is None: + v = self._evaluate(k) + self._cache[k] = v + return v + + def _evaluate(self, index): + """Evaluate one slice at along lazy axis.""" + raise NotImplementedError + + def _evaluate_all(self): + if len(self.shape) == 0: + return self._cached_evaluate(None) + else: + return self[:] + + +class _JacobianNoBatch(_Jacobian): + """Compute Jacobian matrix without batch dimension. + Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is + ``(N, M)`` . + """ + + def __init__(self, ys, xs): + self.is_batched = False + super().__init__(ys, xs) + # inner_shape is for convenient, it will regard 0-D tensor as 1-D tensor + self.inner_shape = [ + *(self._flatten_ys.shape[0:1]), + *(self._flatten_xs.shape[0:1]), + ] + self.shape = [ + *(self.original_ys_shape[0:1]), + *(self.original_xs_shape[0:1]), + ] + + @property + def _lazy_axis(self): + return 0 + + def _flatten(self, xs): + if not isinstance(xs, Sequence): + return xs.reshape((-1,)) + return paddle.concat(tuple(x.reshape((-1,)) for x in xs)) + + def _evaluate(self, row_index): + return self._flatten( + _grad_for_jacobian( + self._flatten_ys[row_index], + self._xs, + ) + ) + + +class _JacobianBatchFirst(_Jacobian): + """Compute Jacobian matrix with batch at first axis. + Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is + ``(B, N, M)`` . + """ + + def __init__(self, ys, xs): + self.is_batched = True + super().__init__(ys, xs) + # inner_shape is for convenient, it will regard 0-D tensor as 1-D tensor + self.inner_shape = [ + *(self._flatten_xs.shape[0:1]), + *(self._flatten_ys.shape[1:2]), + *(self._flatten_xs.shape[1:2]), + ] + self.shape = [ + *(self._flatten_xs.shape[0:1]), + *(self.original_ys_shape[1:2]), + *(self.original_xs_shape[1:2]), + ] + + @property + def _lazy_axis(self): + return 1 + + def _flatten(self, xs): + if not isinstance(xs, Sequence): + return xs.reshape((xs.shape[0], -1)) + return paddle.concat( + tuple(x.reshape((x.shape[0], -1)) for x in as_tensors(xs)), 1 + ) + + def _evaluate(self, row_index): + return self._flatten( + _grad_for_jacobian(self._flatten_ys[:, row_index], self._xs) + ) + + +def _multi_index(indexes, shape): + """A tool for parsing N-dimensional index into a standard format. + + Currently supporting following input format: + * ([positive|negative|slice], ...), the right-most elements can be + omited. + + The standard format after converted is slice tuple which contains N elements: + * ([positive|slice], ..., [positive|slice]) + + Notes: + Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. + + Args: + indexes (tuple): The input indexes. + shape (tuple): The input shape. + + Returns: + tuple: The standard format index as the above description. + """ + indexes = indexes if isinstance(indexes, Sequence) else (indexes,) + if any(isinstance(i, type(Ellipsis)) for i in indexes): + raise IndexError('Ellipsis index currently is not supported.') + # Fill the right-most elements. 
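+    # e.g. with shape (3, 4), the partial index (1,) is padded to
+    # (1, slice(0, None, None)) before being converted below.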
+ indexes = indexes + (slice(0, None, None),) * (len(shape) - len(indexes)) + # Convert to positive index. + positive_indexes = [] + for i, index in enumerate(indexes): + if isinstance(index, slice): + index = slice( + index.start or 0, index.stop or shape[i], index.step or 1 + ) + positive_indexes.append( + slice( + index.start + shape[i] if index.start < 0 else index.start, + index.stop + shape[i] if index.stop < 0 else index.stop, + # Negative step means index backward, no need to convert to + # positive interger. + index.step, + ) + ) + elif isinstance(index, int): + positive_indexes.append(index + shape[i] if index < 0 else index) + else: + raise TypeError(f'Not supported index type {index}.') + return tuple(positive_indexes) + + +def jacobian( + ys: Union[paddle.Tensor, Tuple[paddle.Tensor, ...]], + xs: Union[paddle.Tensor, Tuple[paddle.Tensor, ...]], + batch_axis: Optional[int] = None, +) -> Union[Tuple[Tuple[Jacobian, ...], ...], Tuple[Jacobian, ...], Jacobian]: + r""" + Computes the Jacobian of the dependent variable ``ys`` versus the independent + variable ``xs``. + + Where ``ys`` represents the output of ``xs`` after a certain operation, ``ys`` and + ``xs`` can be Tensor or tuple of Tensors, ``batch_axis`` indicates the position of + the batch dimension of the parameter data. + + When the input is a tuple Tensors, the returned result is a ``Jacobian`` object with + the same number of nesting levels as ``xs``, and each Jacobian has the same shape as + The ``xs`` tuples are identical in one-to-one correspondence. + + - When ``batch_axis=None``, only 0-dimensional Tensor or 1-dimensional Tensor is + supported, assuming the shape of ``xs`` is ``[N, ]``, the shape of ``ys`` is + ``[M, ]``, then the output Jacobian matrix shape is ``[M, N]``. + + - When ``batch_axis=0``, only 1-dimensional Tensor or 2-dimensional Tensor is + supported, assuming the shape of ``xs`` is ``[B, N]``, The shape of ``ys`` is + ``[B, M]``, then the output Jacobian matrix shape is ``[B, M, N]``. + + After the ``Jacobian`` object is created, the actual calculation process does not + occur, but the lazy evaluation method is used for calculation. It can be + multi-dimensional indexed to obtain the entire Jacobian matrix or sub-matrix, and + the actual calculation will be performed at this time the value is calculated and + the result is returned. At the same time, in the actual evaluation process, the + calculated sub-matrix will be cached to avoid duplicate calculations in the + subsequent indexing process. + + For example, assuming ``Jacobian`` instance ``J`` has shape ``[B, M, N]``, assuming + ``M > 4`` , then ``J[:, 1:4:1, :]`` means to get the values from row ``1`` to row + ``3`` of ``J``. In actual calculation, only the rows ``1`` to ``3`` are evaluated, + and the calculation results of ``1`` to ``3`` will be cached at the granularity of + the row, and will be used next time. When obtaining one or more rows of results + above, the already calculated parts will not be recalculated. + + Args: + + ys (Union[paddle.Tensor, Tuple[paddle.Tensor, ...]]): Output or tuple of outputs derived from xs. + xs (Union[paddle.Tensor, Tuple[paddle.Tensor, ...]]): Input or tuple of inputs. + batch_axis (Optional[int], optional): Index of batch axis. Defaults to None. + + Returns: + + Union[Tuple[Tuple[Jacobian, ...], ...], Tuple[Jacobian, ...], Jacobian]: Jacobian(s) of ys deriveted from xs. + + Examples: + + .. 
code-block:: python + + import paddle + + x1 = paddle.randn([3, ]) + x2 = paddle.randn([3, ]) + x1.stop_gradient = False + x2.stop_gradient = False + + y = x1 + x2 + + J = paddle.autograd.jacobian(y, (x1, x2)) + J_y_x1 = J[0][:] # evaluate result of dy/dx1 + J_y_x2 = J[1][:] # evaluate result of dy/dx2 + + print(J_y_x1.shape) # [3, 3] + print(J_y_x2.shape) # [3, 3] + """ + + if batch_axis is not None and batch_axis != 0: + raise ValueError( + f"batch_axis should be None or 0, but got {batch_axis}." + ) + + # TODO(HydrogenSulfate): support batch_axis > 0 + is_batched = batch_axis is not None + if isinstance(ys, Sequence) and isinstance(xs, Sequence): + _jacobian = tuple( + tuple(Jacobian(_ys, _xs, is_batched) for _xs in xs) for _ys in ys + ) + elif isinstance(ys, Sequence) and not isinstance(xs, Sequence): + _jacobian = tuple(Jacobian(_ys, xs, is_batched) for _ys in ys) + elif not isinstance(ys, Sequence) and isinstance(xs, Sequence): + _jacobian = tuple(Jacobian(ys, _xs, is_batched) for _xs in xs) + else: + _jacobian = Jacobian(ys, xs, is_batched) + + return _jacobian + + +def hessian( + ys: paddle.Tensor, + xs: Union[paddle.Tensor, Tuple[paddle.Tensor, ...]], + batch_axis: Optional[int] = None, +) -> Union[Tuple[Tuple[Hessian, ...], ...], Hessian]: + r""" + Computes the Jacobian of the dependent variable ``ys`` versus the independent + variable ``xs``. + + Among them, ``ys`` means the output of ``xs`` after a certain operation, ``ys`` can + only be a single Tensor, ``xs`` can be a Tensor or a Tensor tuple, and + ``batch_axis`` means The position of the batch dimension of the parameter data. + + When the input ``xs`` is a Tensor tuple, the returned result is a ``Hessian`` tuple, + assuming that the internal shape of the ``xs`` tuple is composed of + ``([M1, ], [M2, ]) ``, the shape of the returned result consists of + ``(([M1, M1], [M1, M2]), ([M2, M1], [M2, M2]))`` + + - When ``batch_axis=None``, only 0-dimensional Tensor or 1-dimensional Tensor is + supported, assuming that the shape of ``xs`` is ``[N, ]``, and the shape of ``ys`` is ``[ ]``(0-dimensional Tensor), the final output is a single Hessian matrix whose shape is ``[N, N]``. + + - When ``batch_axis=0``, only 1-dimensional Tensor or 2-dimensional Tensor is + supported, assuming that the shape of ``xs`` is ``[B, N]``, and the shape of ``ys`` is `` [B, ]``, the final output Jacobian matrix shape is ``[B, N, N]``. + + After the ``Hessian`` object is created, the complete calculation process does not + occur, but a partial lazy evaluation method is used for calculation. It can be + multi-dimensionally indexed to obtain the entire Hessian matrix or sub-matrix. At + this time, the actual Evaluates the computation and returns the result. At the same + time, in the actual evaluation process, the calculated sub-matrix will be cached to + avoid repeated calculations in the subsequent indexing process. + + Args: + + ys (paddle.Tensor): Output derived from xs which contain one element. + xs (Union[paddle.Tensor, Tuple[paddle.Tensor, ...]]): Input or tuple of inputs. + batch_axis (Optional[int], optional): Index of batch axis. Defaults to None. + + Returns: + + Union[Tuple[Tuple[Hessian, ...], ...], Tuple[Hessian, ...], Hessian]: Hessian(s) of ys deriveted from xs. + + Examples: + + .. 
code-block:: python + + import paddle + + x1 = paddle.randn([3, ]) + x2 = paddle.randn([4, ]) + x1.stop_gradient = False + x2.stop_gradient = False + + y = x1.sum() + x2.sum() + + H = paddle.autograd.hessian(y, (x1, x2)) + H_y_x1_x1 = H[0][0][:] # evaluate result of ddy/dx1x1 + H_y_x1_x2 = H[0][1][:] # evaluate result of ddy/dx1x2 + H_y_x2_x1 = H[1][0][:] # evaluate result of ddy/dx2x1 + H_y_x2_x2 = H[1][1][:] # evaluate result of ddy/dx2x2 + + print(H_y_x1_x1.shape) # [3, 3] + print(H_y_x1_x2.shape) # [3, 4] + print(H_y_x2_x1.shape) # [4, 3] + print(H_y_x2_x2.shape) # [4, 4] + """ + + if batch_axis is None: + if ys.numel() > 1: + raise ValueError( + f"Only support ys.numel()({ys.numel()})==1 when batch_axis is None." + ) + ys = ys.reshape(()) + elif isinstance(batch_axis, int): + if ys[0].numel() > 1: + raise ValueError( + f"Only support ys[0].numel()({ys.numel()})==1 when batch_axis is int" + ) + # TODO(HydrogenSulfate): support batch_axis > 0 + if batch_axis != 0: + raise ValueError("Only support batch_axis=0 yet.") + ys = ys.reshape((-1,)) + else: + raise ValueError( + f"batch_axis should be None or int, but got {type(batch_axis)}." + ) + + _jacobian = jacobian(ys, xs, batch_axis) + if not isinstance(xs, Sequence): + hessian = jacobian(_jacobian, xs, batch_axis) + + # change classname to Hessian instead of Jacobian. + hessian.__class__ = Hessian + else: + hessian = tuple(jacobian(_j, xs, batch_axis) for _j in _jacobian) + + # change classname to Hessian instead of Jacobian. + for i in range(len(hessian)): + for j in range(len(hessian[0])): + hessian[i][j].__class__ = Hessian + + return hessian + + +def _replace_none_with_zero_tensor(xs, refs): + if xs is None: + xs = paddle.zeros_like(refs) + xs.stop_gradient = refs.stop_gradient + return xs + elif isinstance(xs, Sequence): + return tuple( + _replace_none_with_zero_tensor(x, refs[i]) for i, x in enumerate(xs) + ) + else: + return xs + + +def _grad_for_jacobian(ys, xs, v=None): + """A gradient function that can be used in dynamic graph and static graph. + + The ``grad`` combines ``paddle.grad`` used in dynamic graph and + ``paddle.static.gradients`` used in static graph, and do following changes: + + * The ``allow_unused`` flag is removed and set defaults to true internally, + none in outputs will be replaced by zero tensor. + * The ``create_graph`` flag is removed and set defaults to true internally, + only makes sense in dynamic graph. + * When xs is a single Tensor, ``paddle.grad`` returns a list which only + contains one Tensor. It may confuse users, thus in this case we improve + to return a single Tensor in _grad_for_jacobian interface. + + Args: + ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of + the graph to compute gradients. + xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to + compute gradients. The returned values of this API are the + gradients of inputs . + v (Tensor|Sequence[Tensor]|None,optional): The initial gradient values + of outputs . If grad_outputs is None, the initial gradient values of + outputs would be Tensors filled with 1; if grad_outputs is not None, + it must have the same length as outputs , and in this case, the + initial gradient value of the i-th outputs would be: (1) a Tensor + filled with 1 when the i-th element of grad_outputs is None; + (2) the i-th element of grad_outputs when the i-th element of + grad_outputs is a Tensor. Default None. 
+ + Returns: + Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the + same as the Tensor number inside inputs, and the i-th returned + Tensor is the sum of gradients of outputs with respect to the i-th + inputs. + """ + if paddle.fluid._non_static_mode(): + # paddle.grad returns a list though the inputs is a signle Tensor. The + # follow code snippet fixes the problem by return the first element of + # xs_grad when the xs is a signle Tensor. + xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) + if ( + isinstance(xs, paddle.fluid.framework.Variable) + and isinstance(xs_grad, Sequence) + and len(xs_grad) > 0 + ): + xs_grad = xs_grad[0] + else: + xs_grad = paddle.static.gradients(ys, xs, v) + if ( + isinstance(xs, framework.Variable) + and isinstance(xs_grad, Sequence) + and len(xs_grad) > 0 + ): + xs_grad = xs_grad[0] + return _replace_none_with_zero_tensor(xs_grad, xs) diff --git a/test/autograd/CMakeLists.txt b/test/autograd/CMakeLists.txt index f1af779f4f46d..592517cb8e3da 100644 --- a/test/autograd/CMakeLists.txt +++ b/test/autograd/CMakeLists.txt @@ -15,6 +15,7 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() +set_tests_properties(test_autograd_dynamic PROPERTIES TIMEOUT 100) set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) diff --git a/test/autograd/test_autograd_dynamic.py b/test/autograd/test_autograd_dynamic.py new file mode 100644 index 0000000000000..e8e5d8b626c5f --- /dev/null +++ b/test/autograd/test_autograd_dynamic.py @@ -0,0 +1,668 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
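+#
+# Dynamic-graph tests for paddle.autograd.jacobian and paddle.autograd.hessian.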
+ +import collections +import typing +import unittest + +import config +import numpy as np +import utils + +import paddle +import paddle.nn.functional as F +from paddle.incubate.autograd.utils import as_tensors + + +def make_v(f, inputs): + outputs = as_tensors(f(*inputs)) + return [paddle.ones_like(x) for x in outputs] + + +@utils.place(config.DEVICES) +@utils.parameterize( + (utils.TEST_CASE_NAME, 'func', 'xs'), + ( + ('1d_in_1d_out', utils.square, np.array([2.0, 3.0])), + ( + 'single_in_single_out', + utils.square, + np.random.rand( + 6, + ), + ), + ( + 'multi_in_single_out', + paddle.matmul, + ( + np.random.rand( + 4, + ), + np.random.rand( + 4, + ), + ), + ), + ), +) +class TestJacobianNoBatch(unittest.TestCase): + def setUp(self): + self._dtype = ( + self.xs[0].dtype + if isinstance(self.xs, typing.Sequence) + else self.xs.dtype + ) + self._eps = ( + config.TOLERANCE.get(str(self._dtype)) + .get("first_order_grad") + .get("eps") + ) + self._rtol = ( + config.TOLERANCE.get(str(self._dtype)) + .get("first_order_grad") + .get("rtol") + ) + self._atol = ( + config.TOLERANCE.get(str(self._dtype)) + .get("first_order_grad") + .get("atol") + ) + + def test_jacobian(self): + xs = ( + [paddle.to_tensor(x, stop_gradient=False) for x in self.xs] + if isinstance(self.xs, typing.Sequence) + else paddle.to_tensor(self.xs, stop_gradient=False) + ) + ys = ( + self.func(*xs) if isinstance(xs, typing.Sequence) else self.func(xs) + ) + self._actual = paddle.autograd.jacobian(ys, xs, batch_axis=None) + if isinstance(self._actual, (tuple, list)): + self._actual = paddle.concat([x[:] for x in self._actual], axis=1) + self._expected = self._get_expected() + + Index = collections.namedtuple('Index', ('type', 'value')) + indexes = ( + Index('all', (slice(0, None, None), slice(0, None, None))), + Index('row', (0, slice(0, None, None))), + Index('col', (slice(0, None, None), 0)), + Index('multi-row', (slice(0, 2, 1), slice(0, None, None))), + ) + self.assertEqual(self._actual[:].numpy().dtype, self._expected.dtype) + for index in indexes: + np.testing.assert_allclose( + self._actual.__getitem__(index.value), + self._expected.__getitem__(index.value), + rtol=self._rtol, + atol=self._atol, + err_msg=f'Testcase {index.type} index not passed, value is {index.value}', + ) + + def test_jacobian_attribute_operator(self): + xs = ( + [paddle.to_tensor(x, stop_gradient=False) for x in self.xs] + if isinstance(self.xs, typing.Sequence) + else paddle.to_tensor(self.xs, stop_gradient=False) + ) + ys = ( + self.func(*xs) if isinstance(xs, typing.Sequence) else self.func(xs) + ) + self._actual = paddle.autograd.jacobian(ys, xs, batch_axis=None) + if isinstance(self._actual, (tuple, list)): + self._actual = paddle.concat([x[:] for x in self._actual], axis=1) + self._expected = self._get_expected() + + Index = collections.namedtuple('Index', ('type', 'value')) + indexes = ( + Index('all', (slice(0, None, None), slice(0, None, None))), + Index('row', (0, slice(0, None, None))), + Index('col', (slice(0, None, None), 0)), + Index('multi-row', (slice(0, 2, 1), slice(0, None, None))), + ) + self.assertEqual(self._actual.numpy().dtype, self._expected.dtype) + for index in indexes: + np.testing.assert_allclose( + self._actual.__getitem__(index.value), + self._expected.__getitem__(index.value), + rtol=self._rtol, + atol=self._atol, + err_msg=f'Testcase {index.type} index not passed, value is {index.value}', + ) + + def _get_expected(self): + xs = ( + [paddle.to_tensor(x, stop_gradient=False) for x in self.xs] + if isinstance(self.xs, 
typing.Sequence) + else paddle.to_tensor(self.xs, stop_gradient=False) + ) + jac = utils._compute_numerical_jacobian( + self.func, xs, self._eps, self._dtype + ) + return utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NM) + + +@utils.place(config.DEVICES) +@utils.parameterize( + (utils.TEST_CASE_NAME, 'func', 'xs'), + ( + ( + '1d_in_1d_out', + utils.square, + np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 3.0]]), + ), + ('multi_in_single_out', utils.square, np.random.rand(2, 3)), + ), +) +class TestJacobianBatchFirst(unittest.TestCase): + def setUp(self): + self._dtype = ( + self.xs[0].dtype + if isinstance(self.xs, typing.Sequence) + else self.xs.dtype + ) + self._eps = ( + config.TOLERANCE.get(str(self._dtype)) + .get("first_order_grad") + .get("eps") + ) + self._rtol = ( + config.TOLERANCE.get(str(self._dtype)) + .get("first_order_grad") + .get("rtol") + ) + self._atol = ( + config.TOLERANCE.get(str(self._dtype)) + .get("first_order_grad") + .get("atol") + ) + + def test_jacobian(self): + xs = ( + [paddle.to_tensor(x, stop_gradient=False) for x in self.xs] + if isinstance(self.xs, typing.Sequence) + else paddle.to_tensor(self.xs, stop_gradient=False) + ) + ys = ( + self.func(*xs) if isinstance(xs, typing.Sequence) else self.func(xs) + ) + self._actual = paddle.autograd.jacobian(ys, xs, batch_axis=0) + self._expected = self._get_expected() + + Index = collections.namedtuple('Index', ('type', 'value')) + indexes = ( + Index( + 'all', + ( + slice(0, None, None), + slice(0, None, None), + slice(0, None, None), + ), + ), + Index('row', (slice(0, None, None), 0, slice(0, None, None))), + Index('col', (slice(0, None, None), slice(0, None, None), 0)), + Index( + 'batch', + (slice(0, 2, None), slice(0, None, None), slice(0, None, None)), + ), + Index( + 'multi_row', + (slice(0, 1, None), slice(0, 2, 1), slice(0, None, None)), + ), + ) + self.assertEqual(self._actual[:].numpy().dtype, self._expected.dtype) + for index in indexes: + np.testing.assert_allclose( + self._actual.__getitem__(index.value), + self._expected.__getitem__(index.value), + rtol=self._rtol, + atol=self._atol, + err_msg=f'Testcase {index.type} index not passed, value is {index.value}', + ) + + def test_jacobian_attribute_operator(self): + # test for attribute operator "." 
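+        # Accessing attributes such as .numpy() is forwarded by
+        # Jacobian.__getattr__, which evaluates the full matrix first.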
+ xs = ( + [paddle.to_tensor(x, stop_gradient=False) for x in self.xs] + if isinstance(self.xs, typing.Sequence) + else paddle.to_tensor(self.xs, stop_gradient=False) + ) + ys = ( + self.func(*xs) if isinstance(xs, typing.Sequence) else self.func(xs) + ) + self._actual = paddle.autograd.jacobian(ys, xs, batch_axis=0) + self._expected = self._get_expected() + + Index = collections.namedtuple('Index', ('type', 'value')) + indexes = ( + Index( + 'all', + ( + slice(0, None, None), + slice(0, None, None), + slice(0, None, None), + ), + ), + Index('row', (slice(0, None, None), 0, slice(0, None, None))), + Index('col', (slice(0, None, None), slice(0, None, None), 0)), + Index( + 'batch', + (slice(0, 2, None), slice(0, None, None), slice(0, None, None)), + ), + Index( + 'multi_row', + (slice(0, 1, None), slice(0, 2, 1), slice(0, None, None)), + ), + ) + self.assertEqual(self._actual.numpy().dtype, self._expected.dtype) + for index in indexes: + np.testing.assert_allclose( + self._actual.__getitem__(index.value), + self._expected.__getitem__(index.value), + rtol=self._rtol, + atol=self._atol, + err_msg=f'Testcase {index.type} index not passed, value is {index.value}', + ) + + def _get_expected(self): + xs = ( + [paddle.to_tensor(x, stop_gradient=False) for x in self.xs] + if isinstance(self.xs, typing.Sequence) + else paddle.to_tensor(self.xs, stop_gradient=False) + ) + jac = utils._compute_numerical_batch_jacobian( + self.func, xs, self._eps, self._dtype, False + ) + jac = utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NBM) + return utils._np_transpose_matrix_format( + jac, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM + ) + + +class TestHessianNoBatch(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (4,) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = ( + config.TOLERANCE.get(self.dtype).get("second_order_grad").get("eps") + ) + self.rtol = ( + config.TOLERANCE.get(self.dtype) + .get("second_order_grad") + .get("rtol") + ) + self.atol = ( + config.TOLERANCE.get(self.dtype) + .get("second_order_grad") + .get("atol") + ) + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + def func_create_graph_true(self): + def func(x): + return paddle.sum(F.sigmoid(x)) + + numerical_hessian = utils._compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype + ) + numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func(self.x), self.x, batch_axis=None) + assert not hessian[:].stop_gradient + np.testing.assert_allclose( + hessian[:].numpy(), numerical_hessian, self.rtol, self.atol + ) + + def func_out_not_single(self): + def func(x): + return x * x + + with self.assertRaises(ValueError): + x = paddle.ones([3]) + paddle.autograd.hessian(func(x), x, batch_axis=None) + + def func_add(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected + 1.0 + actual = H + 1.0 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_sub(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected - 1.0 + actual = H - 1.0 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_mul(self): + def func(x): + return (x * 
x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected * 2.0 + actual = H * 2.0 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_div(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected / 2.0 + actual = H / 2.0 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_truediv(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected / 2.0 + actual = H / 2.0 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_pow(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected**3.0 + actual = H**3.0 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_mod(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected % 1.2 + actual = H % 1.2 + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_matmul(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected @ expected + actual = H @ H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_eq(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected == expected + actual = H == H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_ne(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected != expected + actual = H != H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_lt(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected < expected + actual = H < H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_le(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected <= expected + actual = H <= H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_gt(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected > expected + actual = H > H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_ge(self): + def func(x): + return (x * x).sum() + + H = paddle.autograd.hessian(func(self.x), self.x) + expected = np.diag(np.full((self.x.size,), 2.0)) + + expected = expected >= expected + actual = H >= H + np.testing.assert_allclose(actual, expected, self.rtol, self.atol) + + def func_0Dtensor_index(self): + x_0d = self.x[0].reshape([]) + + def func(x): + return x * x + + with self.assertRaises(IndexError): + H = 
paddle.autograd.hessian(func(x_0d), x_0d) + H = H[:] + + def func_2Dtensor(self): + x_2d = self.x.reshape([self.x.shape[0] // 2, 2]) + + def func(x): + return (x * x).sum() + + with self.assertRaises(ValueError): + H = paddle.autograd.hessian(func(x_2d), x_2d) + + def test_all_cases(self): + self.setUpClass() + self.func_create_graph_true() + self.func_out_not_single() + self.func_add() + self.func_sub() + self.func_mul() + self.func_div() + self.func_truediv() + self.func_pow() + self.func_mod() + self.func_matmul() + self.func_eq() + self.func_ne() + self.func_lt() + self.func_le() + self.func_gt() + self.func_ge() + self.func_0Dtensor_index() + self.func_2Dtensor() + + +class TestHessianBatchFirst(unittest.TestCase): + @classmethod + def setUpClass(self): + self.x_shape = (5, 2) + self.weight_shape = (2, 4) + self.y_shape = (5, 2) + self.nbatch, self.nrow = 5, 2 + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = ( + config.TOLERANCE.get(self.dtype).get('second_order_grad').get('eps') + ) + self.rtol = ( + config.TOLERANCE.get(self.dtype) + .get('second_order_grad') + .get('rtol') + ) + self.atol = ( + config.TOLERANCE.get(self.dtype) + .get('second_order_grad') + .get('atol') + ) + self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) + self.x.stop_gradient = False + self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) + self.weight.stop_gradient = False + self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) + self.y.stop_gradient = False + + def func_allow_unused(self): + def func(x, y): + return paddle.matmul(x * x, self.weight)[:, 0:1] + + xs_len = 2 + expected = utils._compute_numerical_batch_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype + ) + expected = np.reshape( + np.array(expected), + (xs_len, xs_len, self.nrow, self.nbatch, self.nrow), + ) + expected = [list(row) for row in expected] + expected = utils._np_concat_matrix_sequence(expected) + expected = utils._np_transpose_matrix_format( + expected, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM + ) + + actual = paddle.autograd.hessian( + func(self.x, self.y), [self.x, self.y], batch_axis=0 + ) + actual = paddle.concat( + [ + paddle.concat([actual[i][j][:] for j in range(2)], axis=2) + for i in range(2) + ], + axis=1, + ) + + np.testing.assert_allclose( + actual.shape, expected.shape, rtol=self.rtol, atol=self.atol + ) + + def func_stop_gradient(self): + def func(x): + return paddle.matmul(x * x, self.weight)[:, 0:1] + + expected = utils._compute_numerical_batch_hessian( + func, self.x, self.numerical_delta, self.np_dtype + ) + + x = self.x.clone() + x.stop_gradient = True + H = paddle.autograd.hessian(func(self.x), self.x, batch_axis=0)[:] + actual = utils._np_transpose_matrix_format( + H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM + ) + actual = actual.reshape((H.shape[1], -1)) + + np.testing.assert_allclose( + actual.shape, np.asarray(expected).shape, self.rtol, self.atol + ) + + def func_out_not_single(self): + def func(x): + return x * x + + with self.assertRaises(ValueError): + x = paddle.ones((3, 3)) + paddle.autograd.hessian(func(x), x, batch_axis=0) + + def func_batch_axis_except_0(self): + def func(x): + return x * x + + with self.assertRaises(ValueError): + x = paddle.ones([3]) + paddle.autograd.hessian(func(x), x, batch_axis=2) + + def func_ndim_bigger_than_2(self): + def func(x): + return (x * x).sum() + + with self.assertRaises(ValueError): + x = paddle.ones([3, 3, 3, 3]) + paddle.autograd.hessian(func(x), x, 
batch_axis=0) + + def func_batch_axis_str(self): + def func(x): + return (x * x).sum() + + with self.assertRaises(ValueError): + x = paddle.ones([3, 3, 3, 3]) + paddle.autograd.hessian(func(x), x, batch_axis="0") + + def func_ellipsis_index(self): + def func(x): + return (x * x).sum() + + with self.assertRaises(IndexError): + x = paddle.ones([2, 3]) + H = paddle.autograd.hessian(func(x), x, batch_axis=0)[..., 1] + + def test_all_cases(self): + self.setUpClass() + self.func_allow_unused() + self.func_stop_gradient() + self.func_out_not_single() + self.func_batch_axis_except_0() + self.func_ndim_bigger_than_2() + self.func_batch_axis_str() + self.func_ellipsis_index() + + +if __name__ == "__main__": + np.random.seed(2022) + unittest.main() From 89d1dd2ec0273d1d4d13c8346bff97c3b0e4a7d3 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 27 Apr 2023 13:03:41 +0800 Subject: [PATCH 103/405] [CustomOP Unittest] XPU unittest only keep forward test (#53021) --- test/custom_op/custom_relu_op_xpu.cc | 127 -------------- .../test_custom_relu_op_xpu_setup.py | 156 +----------------- 2 files changed, 3 insertions(+), 280 deletions(-) diff --git a/test/custom_op/custom_relu_op_xpu.cc b/test/custom_op/custom_relu_op_xpu.cc index ee717785ad848..8d9e2e2af4980 100644 --- a/test/custom_op/custom_relu_op_xpu.cc +++ b/test/custom_op/custom_relu_op_xpu.cc @@ -31,28 +31,6 @@ void relu_cpu_forward_kernel(const data_t* x_data, } } -template -void relu_cpu_backward_kernel(const data_t* grad_out_data, - const data_t* out_data, - data_t* grad_x_data, - int64_t out_numel) { - for (int64_t i = 0; i < out_numel; ++i) { - grad_x_data[i] = - grad_out_data[i] * (out_data[i] > static_cast(0) ? 1. : 0.); - } -} - -template -void relu_cpu_double_backward_kernel(const data_t* out_data, - const data_t* ddx_data, - data_t* ddout_data, - int64_t ddout_numel) { - for (int64_t i = 0; i < ddout_numel; ++i) { - ddout_data[i] = - ddx_data[i] * (out_data[i] > static_cast(0) ? 1. 
: 0.); - } -} - std::vector relu_cpu_forward(const paddle::Tensor& x) { CHECK_CPU_INPUT(x); auto out = paddle::empty_like(x); @@ -66,77 +44,12 @@ std::vector relu_cpu_forward(const paddle::Tensor& x) { return {out}; } -std::vector relu_cpu_backward(const paddle::Tensor& x, - const paddle::Tensor& out, - const paddle::Tensor& grad_out) { - auto grad_x = paddle::empty_like(x); - - PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { - relu_cpu_backward_kernel( - grad_out.data(), - out.data(), - grad_x.data(), - out.size()); - })); - - return {grad_x}; -} - -std::vector relu_cpu_double_backward( - const paddle::Tensor& out, const paddle::Tensor& ddx) { - CHECK_CPU_INPUT(out); - CHECK_CPU_INPUT(ddx); - auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); - - PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] { - relu_cpu_double_backward_kernel( - out.data(), - ddx.data(), - ddout.mutable_data(out.place()), - ddout.size()); - })); - - return {ddout}; -} - std::vector relu_xpu_forward(const paddle::Tensor& x) { CHECK_XPU_INPUT(x); auto out = paddle::relu(x); return {out}; } -std::vector relu_xpu_backward(const paddle::Tensor& x, - const paddle::Tensor& out, - const paddle::Tensor& grad_out) { - CHECK_XPU_INPUT(x); - CHECK_XPU_INPUT(out); - CHECK_XPU_INPUT(grad_out); - auto grad_x = paddle::empty_like(x, x.dtype(), x.place()); - auto ones = paddle::experimental::full_like(x, 1.0, x.dtype(), x.place()); - auto zeros = paddle::experimental::full_like(x, 0.0, x.dtype(), x.place()); - auto condition = paddle::experimental::greater_than(x, zeros); - - grad_x = grad_out * paddle::where(condition, ones, zeros); - - return {grad_x}; -} - -std::vector relu_xpu_double_backward( - const paddle::Tensor& out, const paddle::Tensor& ddx) { - CHECK_XPU_INPUT(out); - CHECK_XPU_INPUT(ddx); - auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); - auto ones = - paddle::experimental::full_like(out, 1.0, out.dtype(), out.place()); - auto zeros = - paddle::experimental::full_like(out, 0.0, out.dtype(), out.place()); - auto condition = paddle::experimental::greater_than(out, zeros); - - ddout = paddle::multiply(ddx, paddle::where(condition, ones, zeros)); - - return {ddout}; -} - std::vector ReluForward(const paddle::Tensor& x) { if (x.is_cpu()) { return relu_cpu_forward(x); @@ -147,47 +60,7 @@ std::vector ReluForward(const paddle::Tensor& x) { } } -std::vector ReluBackward(const paddle::Tensor& x, - const paddle::Tensor& out, - const paddle::Tensor& grad_out) { - if (x.is_cpu()) { - return relu_cpu_backward(x, out, grad_out); - } else if (x.is_xpu()) { - return relu_xpu_backward(x, out, grad_out); - } else { - PD_THROW("Not implemented."); - } -} - -std::vector ReluDoubleBackward(const paddle::Tensor& out, - const paddle::Tensor& ddx) { - if (out.is_cpu()) { - return relu_cpu_double_backward(out, ddx); - } else if (out.place().GetType() == phi::AllocationType::XPU) { - return relu_xpu_double_backward(out, ddx); - } else { - PD_THROW("Not implemented."); - } -} - -std::vector> ReluDoubleBackwardInferShape( - const std::vector& out_shape, - const std::vector& ddx_shape) { - return {out_shape}; -} - PD_BUILD_OP(custom_relu) .Inputs({"X"}) .Outputs({"Out"}) .SetKernelFn(PD_KERNEL(ReluForward)); - -PD_BUILD_GRAD_OP(custom_relu) - .Inputs({"X", "Out", paddle::Grad("Out")}) - .Outputs({paddle::Grad("X")}) - .SetKernelFn(PD_KERNEL(ReluBackward)); - -PD_BUILD_DOUBLE_GRAD_OP(custom_relu) - .Inputs({"Out", paddle::Grad(paddle::Grad("X"))}) - 
.Outputs({paddle::Grad(paddle::Grad("Out"))}) - .SetKernelFn(PD_KERNEL(ReluDoubleBackward)) - .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape)); diff --git a/test/custom_op/test_custom_relu_op_xpu_setup.py b/test/custom_op/test_custom_relu_op_xpu_setup.py index e054eadafd03a..db8d56c742911 100644 --- a/test/custom_op/test_custom_relu_op_xpu_setup.py +++ b/test/custom_op/test_custom_relu_op_xpu_setup.py @@ -34,15 +34,7 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): t.retain_grads() out = func(t) if use_func else paddle.nn.functional.relu(t) - out.retain_grads() - out.stop_gradient = False - - out.backward() - - if t.grad is None: - return out.numpy(), t.grad - else: - return out.numpy(), t.grad.numpy() + return out.numpy() def custom_relu_static( @@ -71,80 +63,6 @@ def custom_relu_static( return out_v -def custom_relu_static_inference(func, device, np_data, np_label, path_prefix): - paddle.set_device(device) - - with static.scope_guard(static.Scope()): - with static.program_guard(static.Program()): - # simple module - data = static.data( - name='data', shape=[None, 1, 28, 28], dtype='float32' - ) - label = static.data(name='label', shape=[None, 1], dtype='int64') - - hidden = static.nn.fc(data, size=128) - hidden = func(hidden) - hidden = static.nn.fc(hidden, size=128) - predict = static.nn.fc(hidden, size=10, activation='softmax') - loss = paddle.nn.functional.cross_entropy(input=hidden, label=label) - avg_loss = paddle.mean(loss) - - opt = paddle.optimizer.SGD(learning_rate=0.1) - opt.minimize(avg_loss) - - # run start up model - exe = static.Executor() - exe.run(static.default_startup_program()) - - # train - for _ in range(4): - exe.run( - static.default_main_program(), - feed={'data': np_data, 'label': np_label}, - fetch_list=[avg_loss], - ) - - # save inference model - static.save_inference_model(path_prefix, [data], [predict], exe) - - # get train predict value - predict_v = exe.run( - static.default_main_program(), - feed={'data': np_data, 'label': np_label}, - fetch_list=[predict], - ) - - return predict_v - - -def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): - - paddle.set_device(device) - - t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) - t.retain_grads() - - out = func(t) if use_func else paddle.nn.functional.relu(t) - out.retain_grads() - dx = paddle.grad( - outputs=out, - inputs=t, - grad_outputs=paddle.ones_like(t), - create_graph=True, - retain_graph=True, - ) - - ddout = paddle.grad( - outputs=dx[0], - inputs=out.grad, - grad_outputs=paddle.ones_like(t), - create_graph=False, - ) - - assert ddout[0].numpy() is not None - return dx[0].numpy(), ddout[0].numpy() - - class TestNewCustomOpXpuSetUpInstall(unittest.TestCase): def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) @@ -189,79 +107,11 @@ def test_static(self): def test_dynamic(self): for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, x_grad = custom_relu_dynamic( - self.custom_op, self.device, dtype, x - ) - pd_out, pd_x_grad = custom_relu_dynamic( - self.custom_op, self.device, dtype, x, False - ) - check_output(out, pd_out, "out") - check_output(x_grad, pd_x_grad, "x_grad") - - def test_static_save_and_load_inference_model(self): - paddle.enable_static() - np_data = np.random.random((1, 1, 28, 28)).astype("float32") - np_label = np.random.random((1, 1)).astype("int64") - path_prefix = "self.custom_op_inference/custom_relu" - - predict = custom_relu_static_inference( - self.custom_op, 
self.device, np_data, np_label, path_prefix - ) - # load inference model - with static.scope_guard(static.Scope()): - exe = static.Executor() - [ - inference_program, - feed_target_names, - fetch_targets, - ] = static.load_inference_model(path_prefix, exe) - predict_infer = exe.run( - inference_program, - feed={feed_target_names[0]: np_data}, - fetch_list=fetch_targets, - ) - check_output(predict, predict_infer, "predict") - paddle.disable_static() - - def test_static_save_and_run_inference_predictor(self): - paddle.enable_static() - np_data = np.random.random((1, 1, 28, 28)).astype("float32") - np_label = np.random.random((1, 1)).astype("int64") - path_prefix = "self.custom_op_inference/custom_relu" - from paddle.inference import Config, create_predictor - - predict = custom_relu_static_inference( - self.custom_op, self.device, np_data, np_label, path_prefix - ) - # load inference model - config = Config(path_prefix + ".pdmodel", path_prefix + ".pdiparams") - predictor = create_predictor(config) - input_tensor = predictor.get_input_handle( - predictor.get_input_names()[0] - ) - input_tensor.reshape(np_data.shape) - input_tensor.copy_from_cpu(np_data.copy()) - predictor.run() - output_tensor = predictor.get_output_handle( - predictor.get_output_names()[0] - ) - predict_infer = output_tensor.copy_to_cpu() - predict = np.array(predict).flatten() - predict_infer = np.array(predict_infer).flatten() - check_output_allclose(predict, predict_infer, "predict") - paddle.disable_static() - - def test_func_double_grad_dynamic(self): - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, dx_grad = custom_relu_double_grad_dynamic( - self.custom_op, self.device, dtype, x - ) - pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( + out = custom_relu_dynamic(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_dynamic( self.custom_op, self.device, dtype, x, False ) check_output(out, pd_out, "out") - check_output(dx_grad, pd_dx_grad, "dx_grad") def test_with_dataloader(self): paddle.disable_static() From 3278dec7c887e0742ac032989e0fd6280b927b4a Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 27 Apr 2023 13:03:50 +0800 Subject: [PATCH 104/405] [Fix CppExtension Unittest] Change CUDAExtension to CppExtension if necessary (#53352) * [Fix CppExtension Unittest] Change CUDAExtension to CppExtension if necessary * Temporarily test cpp_extension under GPU * Split mixed_extension unittest --- .../fluid/tests/cpp_extension/CMakeLists.txt | 15 +- .../cpp_extension/cpp_extension_setup.py | 17 +- .../cpp_extension/test_cpp_extension_jit.py | 5 +- .../cpp_extension/test_cpp_extension_setup.py | 184 -------------- .../test_mixed_extension_setup.py | 234 ++++++++++++++++++ 5 files changed, 259 insertions(+), 196 deletions(-) create mode 100644 python/paddle/fluid/tests/cpp_extension/test_mixed_extension_setup.py diff --git a/python/paddle/fluid/tests/cpp_extension/CMakeLists.txt b/python/paddle/fluid/tests/cpp_extension/CMakeLists.txt index e9292b51751a1..61241edb8fc46 100644 --- a/python/paddle/fluid/tests/cpp_extension/CMakeLists.txt +++ b/python/paddle/fluid/tests/cpp_extension/CMakeLists.txt @@ -1,5 +1,12 @@ -py_test(test_cpp_extension_setup SRCS test_cpp_extension_setup.py) -py_test(test_cpp_extension_jit SRCS test_cpp_extension_jit.py) +if(WITH_TESTING) + if(WITH_GPU) + py_test(test_cpp_extension_setup SRCS test_cpp_extension_setup.py) + py_test(test_cpp_extension_jit SRCS test_cpp_extension_jit.py) -set_tests_properties(test_cpp_extension_setup PROPERTIES TIMEOUT 120) 
-set_tests_properties(test_cpp_extension_jit PROPERTIES TIMEOUT 120) + set_tests_properties(test_cpp_extension_setup PROPERTIES TIMEOUT 120) + set_tests_properties(test_cpp_extension_jit PROPERTIES TIMEOUT 120) + endif() +endif() + +py_test(test_mixed_extension_setup SRCS test_mixed_extension_setup.py) +set_tests_properties(test_mixed_extension_setup PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py b/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py index 5a4ff2afd6c63..ebede6aa5a6ab 100644 --- a/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py +++ b/python/paddle/fluid/tests/cpp_extension/cpp_extension_setup.py @@ -17,7 +17,8 @@ from utils import extra_compile_args -from paddle.utils.cpp_extension import CUDAExtension, setup +import paddle +from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup paddle_includes = [] for site_packages_path in getsitepackages(): @@ -30,14 +31,16 @@ # Add current dir, search custom_power.h paddle_includes.append(os.path.dirname(os.path.abspath(__file__))) +sources = ["custom_extension.cc", "custom_sub.cc"] +Extension = CppExtension +if paddle.is_compiled_with_cuda(): + sources.append("custom_relu_forward.cu") + Extension = CUDAExtension + setup( name='custom_cpp_extension', - ext_modules=CUDAExtension( - sources=[ - "custom_extension.cc", - "custom_sub.cc", - "custom_relu_forward.cu", - ], + ext_modules=Extension( + sources=sources, include_dirs=paddle_includes, extra_compile_args=extra_compile_args, verbose=True, diff --git a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py index 5723df1b585b0..eac97c8457f46 100644 --- a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py +++ b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_jit.py @@ -28,7 +28,10 @@ sys.exit() # Compile and load cpp extension Just-In-Time. 
-sources = ["custom_extension.cc", "custom_sub.cc", "custom_relu_forward.cu"] +sources = ["custom_extension.cc", "custom_sub.cc"] +if paddle.is_compiled_with_cuda(): + sources.append("custom_relu_forward.cu") + paddle_includes = [] for site_packages_path in getsitepackages(): paddle_includes.append( diff --git a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py index 15d0cb77d03d5..2de2dd80deac9 100644 --- a/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py +++ b/python/paddle/fluid/tests/cpp_extension/test_cpp_extension_setup.py @@ -21,80 +21,9 @@ from utils import check_output import paddle -from paddle import static from paddle.utils.cpp_extension.extension_utils import run_cmd -def custom_relu_static( - func, device, dtype, np_x, use_func=True, test_infer=False -): - paddle.enable_static() - paddle.set_device(device) - - with static.scope_guard(static.Scope()): - with static.program_guard(static.Program()): - x = static.data(name='X', shape=[None, 8], dtype=dtype) - x.stop_gradient = False - out = func(x) if use_func else paddle.nn.functional.relu(x) - static.append_backward(out) - - exe = static.Executor() - exe.run(static.default_startup_program()) - # in static graph mode, x data has been covered by out - out_v = exe.run( - static.default_main_program(), - feed={'X': np_x}, - fetch_list=[out.name], - ) - - paddle.disable_static() - return out_v - - -def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): - paddle.set_device(device) - - t = paddle.to_tensor(np_x, dtype=dtype) - t.stop_gradient = False - - out = func(t) if use_func else paddle.nn.functional.relu(t) - out.stop_gradient = False - - out.backward() - - if t.grad is None: - return out.numpy(), t.grad - else: - return out.numpy(), t.grad.numpy() - - -def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): - paddle.set_device(device) - - t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) - t.retain_grads() - - out = func(t) if use_func else paddle.nn.functional.relu(t) - out.retain_grads() - dx = paddle.grad( - outputs=out, - inputs=t, - grad_outputs=paddle.ones_like(t), - create_graph=True, - retain_graph=True, - ) - - ddout = paddle.grad( - outputs=dx[0], - inputs=out.grad, - grad_outputs=paddle.ones_like(t), - create_graph=False, - ) - - assert ddout[0].numpy() is not None - return dx[0].numpy(), ddout[0].numpy() - - class TestCppExtensionSetupInstall(unittest.TestCase): """ Tests setup install cpp extensions. @@ -117,21 +46,6 @@ def setUp(self): custom_egg_path ) sys.path.append(os.path.join(site_dir, custom_egg_path[0])) - - # install mixed custom_op and extension - cmd = 'cd {} && {} mix_relu_and_extension_setup.py install'.format( - cur_dir, sys.executable - ) - run_cmd(cmd) - - site_dir = site.getsitepackages()[0] - custom_egg_path = [ - x for x in os.listdir(site_dir) if 'mix_relu_extension' in x - ] - assert len(custom_egg_path) == 1, "Matched egg number is %d." 
% len( - custom_egg_path - ) - sys.path.append(os.path.join(site_dir, custom_egg_path[0])) ################################# # config seed @@ -145,19 +59,11 @@ def tearDown(self): pass def test_cpp_extension(self): - # Extension self._test_extension_function_plain() - self._test_extension_function_mixed() self._test_vector_tensor() self._test_extension_class() self._test_nullable_tensor() self._test_optional_tensor() - # Custom op - self._test_static() - self._test_dynamic() - self._test_double_grad_dynamic() - if paddle.is_compiled_with_cuda(): - self._test_cuda_relu() def _test_extension_function_plain(self): import custom_cpp_extension @@ -177,25 +83,6 @@ def _test_extension_function_plain(self): target_out = np.exp(np_x) - np.exp(np_y) np.testing.assert_allclose(out.numpy(), target_out, atol=1e-5) - def _test_extension_function_mixed(self): - import mix_relu_extension - - for dtype in self.dtypes: - np_x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - x = paddle.to_tensor(np_x, dtype=dtype) - np_y = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - y = paddle.to_tensor(np_y, dtype=dtype) - - # Test mix_relu_extension - out = mix_relu_extension.custom_add2(x, y) - target_out = np.exp(np_x) + np.exp(np_y) - np.testing.assert_allclose(out.numpy(), target_out, atol=1e-5) - - # Test we can call a method not defined in the main C++ file. - out = mix_relu_extension.custom_sub2(x, y) - target_out = np.exp(np_x) - np.exp(np_y) - np.testing.assert_allclose(out.numpy(), target_out, atol=1e-5) - def _test_extension_class(self): import custom_cpp_extension @@ -263,77 +150,6 @@ def _test_optional_tensor(self): err_msg=f'extension out: {x},\n numpy out: {x_np}', ) - def _test_static(self): - import mix_relu_extension - - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out = custom_relu_static( - mix_relu_extension.custom_relu, "CPU", dtype, x - ) - pd_out = custom_relu_static( - mix_relu_extension.custom_relu, "CPU", dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - - def _test_dynamic(self): - import mix_relu_extension - - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, x_grad = custom_relu_dynamic( - mix_relu_extension.custom_relu, "CPU", dtype, x - ) - pd_out, pd_x_grad = custom_relu_dynamic( - mix_relu_extension.custom_relu, "CPU", dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) - - def _test_double_grad_dynamic(self): - import mix_relu_extension - - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, dx_grad = custom_relu_double_grad_dynamic( - mix_relu_extension.custom_relu, "CPU", dtype, x - ) - pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( - mix_relu_extension.custom_relu, "CPU", dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) - def _test_cuda_relu(self): import custom_cpp_extension diff --git a/python/paddle/fluid/tests/cpp_extension/test_mixed_extension_setup.py 
b/python/paddle/fluid/tests/cpp_extension/test_mixed_extension_setup.py new file mode 100644 index 0000000000000..92aceff5067af --- /dev/null +++ b/python/paddle/fluid/tests/cpp_extension/test_mixed_extension_setup.py @@ -0,0 +1,234 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import site +import sys +import unittest + +import numpy as np + +import paddle +from paddle import static +from paddle.utils.cpp_extension.extension_utils import run_cmd + + +def custom_relu_static( + func, device, dtype, np_x, use_func=True, test_infer=False +): + paddle.enable_static() + paddle.set_device(device) + + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x = static.data(name='X', shape=[None, 8], dtype=dtype) + x.stop_gradient = False + out = func(x) if use_func else paddle.nn.functional.relu(x) + static.append_backward(out) + + exe = static.Executor() + exe.run(static.default_startup_program()) + # in static graph mode, x data has been covered by out + out_v = exe.run( + static.default_main_program(), + feed={'X': np_x}, + fetch_list=[out.name], + ) + + paddle.disable_static() + return out_v + + +def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): + paddle.set_device(device) + + t = paddle.to_tensor(np_x, dtype=dtype) + t.stop_gradient = False + + out = func(t) if use_func else paddle.nn.functional.relu(t) + out.stop_gradient = False + + out.backward() + + if t.grad is None: + return out.numpy(), t.grad + else: + return out.numpy(), t.grad.numpy() + + +def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): + paddle.set_device(device) + + t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + t.retain_grads() + + out = func(t) if use_func else paddle.nn.functional.relu(t) + out.retain_grads() + dx = paddle.grad( + outputs=out, + inputs=t, + grad_outputs=paddle.ones_like(t), + create_graph=True, + retain_graph=True, + ) + + ddout = paddle.grad( + outputs=dx[0], + inputs=out.grad, + grad_outputs=paddle.ones_like(t), + create_graph=False, + ) + + assert ddout[0].numpy() is not None + return dx[0].numpy(), ddout[0].numpy() + + +class TestCppExtensionSetupInstall(unittest.TestCase): + """ + Tests setup install cpp extensions. + """ + + def setUp(self): + cur_dir = os.path.dirname(os.path.abspath(__file__)) + # install mixed custom_op and extension + # compile, install the custom op egg into site-packages under background + cmd = 'cd {} && {} mix_relu_and_extension_setup.py install'.format( + cur_dir, sys.executable + ) + run_cmd(cmd) + + site_dir = site.getsitepackages()[0] + custom_egg_path = [ + x for x in os.listdir(site_dir) if 'mix_relu_extension' in x + ] + assert len(custom_egg_path) == 1, "Matched egg number is %d." 
% len( + custom_egg_path + ) + sys.path.append(os.path.join(site_dir, custom_egg_path[0])) + ################################# + + # config seed + SEED = 2021 + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + self.dtypes = ['float32', 'float64'] + + def tearDown(self): + pass + + def test_cpp_extension(self): + # Extension + self._test_extension_function_mixed() + # Custom op + self._test_static() + self._test_dynamic() + self._test_double_grad_dynamic() + + def _test_extension_function_mixed(self): + import mix_relu_extension + + for dtype in self.dtypes: + np_x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + x = paddle.to_tensor(np_x, dtype=dtype) + np_y = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + y = paddle.to_tensor(np_y, dtype=dtype) + + # Test mix_relu_extension + out = mix_relu_extension.custom_add2(x, y) + target_out = np.exp(np_x) + np.exp(np_y) + np.testing.assert_allclose(out.numpy(), target_out, atol=1e-5) + + # Test we can call a method not defined in the main C++ file. + out = mix_relu_extension.custom_sub2(x, y) + target_out = np.exp(np_x) - np.exp(np_y) + np.testing.assert_allclose(out.numpy(), target_out, atol=1e-5) + + def _test_static(self): + import mix_relu_extension + + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static( + mix_relu_extension.custom_relu, "CPU", dtype, x + ) + pd_out = custom_relu_static( + mix_relu_extension.custom_relu, "CPU", dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) + + def _test_dynamic(self): + import mix_relu_extension + + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, x_grad = custom_relu_dynamic( + mix_relu_extension.custom_relu, "CPU", dtype, x + ) + pd_out, pd_x_grad = custom_relu_dynamic( + mix_relu_extension.custom_relu, "CPU", dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( + x_grad, pd_x_grad + ), + ) + + def _test_double_grad_dynamic(self): + import mix_relu_extension + + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, dx_grad = custom_relu_double_grad_dynamic( + mix_relu_extension.custom_relu, "CPU", dtype, x + ) + pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( + mix_relu_extension.custom_relu, "CPU", dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + dx_grad, + pd_dx_grad, + err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( + dx_grad, pd_dx_grad + ), + ) + + +if __name__ == '__main__': + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + sys.exit() + unittest.main() From 18e9dcdcdb79467fdc799e332f0f504bfabfb6a1 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Thu, 27 Apr 2023 14:11:35 +0800 Subject: [PATCH 105/405] [AMP] support OD level and skip dynamic loss scaling for bf16 (#53289) * support OD level and skip dynamic loss scaling for bf16 --- paddle/fluid/eager/amp_utils.h | 6 +- paddle/fluid/imperative/amp_auto_cast.h | 1 + paddle/fluid/pybind/imperative.cc | 1 + python/paddle/amp/amp_lists.py | 16 ++++- python/paddle/amp/auto_cast.py | 36 
+++++----- python/paddle/amp/grad_scaler.py | 14 ++++ ...perative_auto_mixed_precision_for_eager.py | 13 ++-- test/amp/test_amp_api.py | 66 +++++++++++++++++++ 8 files changed, 129 insertions(+), 24 deletions(-) create mode 100644 test/amp/test_amp_api.py diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h index ac9edc569df9f..2e06eaab8acae 100644 --- a/paddle/fluid/eager/amp_utils.h +++ b/paddle/fluid/eager/amp_utils.h @@ -129,7 +129,11 @@ inline phi::DataType GetAmpDestDtype( ->count(op_name)) { dst_type = phi::DataType::FLOAT32; } else { - dst_type = GetPromoteType(op_name, amp_tensors_vector, amp_setting_dtype); + if (amp_level == paddle::imperative::AmpLevel::OD) { + dst_type = phi::DataType::FLOAT32; + } else { + dst_type = GetPromoteType(op_name, amp_tensors_vector, amp_setting_dtype); + } } if (dst_type == amp_setting_dtype && diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index ced07b953d0c7..31dfc9dec57ab 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -31,6 +31,7 @@ enum class AmpLevel { O1, // amp, mixed fp32-fp16 O2, // almost fp16 O3, // fp16 + OD, // only conv and matmul use low precison. }; std::tuple, diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 372fae12ec3bc..8d5bd524a1c14 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2152,6 +2152,7 @@ void BindImperative(py::module *m_ptr) { py::enum_(m, "AmpLevel", py::arithmetic()) .value("O0", paddle::imperative::AmpLevel::O0) + .value("OD", paddle::imperative::AmpLevel::OD) .value("O1", paddle::imperative::AmpLevel::O1) .value("O2", paddle::imperative::AmpLevel::O2) .value("O3", paddle::imperative::AmpLevel::O3) diff --git a/python/paddle/amp/amp_lists.py b/python/paddle/amp/amp_lists.py index f70c8f5ed7f91..51c557b9481c6 100644 --- a/python/paddle/amp/amp_lists.py +++ b/python/paddle/amp/amp_lists.py @@ -91,10 +91,19 @@ BF16_BLACK_LIST = set() +# At OD level, ops in WHITE_LIST will use FP16/BF16 and the others will use FP32. def white_list(): white_list = { - "float16": {"O1": FP16_WHITE_LIST, "O2": FP16_WHITE_LIST}, - "bfloat16": {"O1": BF16_WHITE_LIST, "O2": BF16_WHITE_LIST}, + "float16": { + "OD": FP16_WHITE_LIST, + "O1": FP16_WHITE_LIST, + "O2": FP16_WHITE_LIST, + }, + "bfloat16": { + "OD": BF16_WHITE_LIST, + "O1": BF16_WHITE_LIST, + "O2": BF16_WHITE_LIST, + }, } return white_list @@ -102,9 +111,10 @@ def white_list(): def black_list(): black_list = { "float16": { + "OD": set(), "O1": FP16_BLACK_LIST | FP16_EXTRA_BLACK_LIST, "O2": FP16_EXTRA_BLACK_LIST, }, - "bfloat16": {"O1": BF16_BLACK_LIST, "O2": set()}, + "bfloat16": {"OD": set(), "O1": BF16_BLACK_LIST, "O2": set()}, } return black_list diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index ae9c957df68fa..d1eaf0dbd13ae 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -48,6 +48,7 @@ def __init__(self): self.model_parameters = [] self.use_master_grad = False self.already_register_final_backward_hook = False + self.amp_dtype = 'float32' def __setattr__(self, name, val): self.__dict__[name] = val @@ -320,10 +321,8 @@ def amp_guard( # check amp_level: O0-O2 level = level.upper() - if not (level in ['O0', 'O1', 'O2']): - raise ValueError( - "level should be O0, O1 or O2. O0 represents fp32 train mode, O1 represents AMP train mode, O2 represents pure fp16/bf16 train mode." 
- ) + if not (level in ['O0', 'OD', 'O1', 'O2']): + raise ValueError("level should be O0, OD, O1 or O2.") # check amp_dtype: float16 or bfloat16 dtype = dtype.lower() @@ -384,8 +383,11 @@ def amp_guard( ) amp_dtype = dtype + amp_global_state().amp_dtype = amp_dtype - if level == 'O1': + if level == 'OD': + amp_level = AMP_LEVEL.OD + elif level == 'O1': amp_level = AMP_LEVEL.O1 elif level == 'O2': amp_level = AMP_LEVEL.O2 @@ -642,22 +644,24 @@ def auto_cast( ): """ Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. - If enabled, the input data type (float32 or float16) of each operator is decided + If enabled, the input data type (float32, float16 or bfloat16) of each operator is decided by autocast algorithm for better performance. - Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in - imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode. + Commonly, it is used together with `GradScaler` and `decorator` to achieve Auto-Mixed-Precision in + imperative mode. Args: enable(bool, optional): Enable auto-mixed-precision or not. Default is True. - custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support - fp16 calculation and are considered numerically-safe and performance-critical. These ops - will be converted to fp16. - custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16 - calculation and are considered numerically-dangerous and whose effects may also be - observed in downstream ops. These ops will not be converted to fp16. - level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; - O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don't support fp16 kernel and batchnorm. Default is O1(amp) + custom_white_list(set|list|tuple, optional): A default white list is already set. Usually there is no need to set custom white list. + The set of ops should be considered numerically-safe and performance-critical. These ops will be converted to float16/bfloat16. + custom_black_list(set|list|tuple, optional): A default black list is already set. You can set a custom black list according to the model. + The set of ops are considered numerically-dangerous and whose effects may also be observed in downstream ops. These ops will not be + converted to float16/bfloat16. + level(str, optional): Auto mixed precision level. Accepted values are "O1", "O2" and "OD": At the O1 level, operators in the white list + will use float16/bfloat16 inputs for calculations, and operators in the black list will use float32 inputs for calculations. At the O2 + level, model's parameters will be casted to float16/bfloat16 by using `decorator`, and operators that have all float16/bfloat16 inputs + will be converted to float16/bfloat16, and that have any float32 input will be converted to float32. For the OD level, operators in + default white list will compute in float16/bfloat16, and the others will compute in float32. Default is O1. dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. 
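(For reference, a minimal dynamic-graph sketch of the OD behaviour described above; it mirrors the test_amp_OD_level case added in test/amp/test_amp_api.py later in this series, assumes a Paddle build that contains this patch on a device with float16 support, and takes the expected dtypes from that test's assertions.)

    import paddle

    conv = paddle.nn.Conv2D(in_channels=1, out_channels=6, kernel_size=3, bias_attr=False)
    linear = paddle.nn.Linear(in_features=4, out_features=4)

    with paddle.amp.auto_cast(level='OD'):
        # conv2d is in the default white list, so it runs in float16.
        out1 = conv(paddle.rand(shape=[1, 1, 6, 6], dtype='float32'))
        # elementwise add is not in the white list, so the result stays float32.
        out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16')
        # the linear output is also float32 at the OD level.
        out3 = linear(out2)

    # Expected: paddle.float16, paddle.float32, paddle.float32
    print(out1.dtype, out2.dtype, out3.dtype)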
Examples: diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 0f6d9f21a32c6..2cade3482e9c3 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -24,6 +24,8 @@ from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import _dygraph_tracer, dygraph_only +from .auto_cast import amp_global_state + class OptimizerState(Enum): INIT = 0 @@ -179,6 +181,18 @@ def scale(self, var): """ check_type(var, "var", core.eager.Tensor, 'AmpScaler.scale()') + if ( + self._enable + and amp_global_state().amp_dtype != 'float16' + and self._use_dynamic_loss_scaling + ): + self._enable = False + self._use_dynamic_loss_scaling = False + warnings.warn( + 'It is not recommended to use dynamic loss scaling for %s, so GradScaler is disable by default.' + % (amp_global_state().amp_dtype) + ) + if not self._enable: return var diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index 8d24febaff213..5de19dfb4113b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -194,8 +194,11 @@ def test_mode_exception(self): class TestAmpScaler(unittest.TestCase): def scale(self): + if not paddle.amp.is_float16_supported(): + return with fluid.dygraph.guard(): - data = paddle.rand([10, 1024]) + with paddle.amp.auto_cast(dtype='float16'): + data = paddle.rand([10, 1024]) scaler = paddle.amp.AmpScaler(init_loss_scaling=1024) scaled_data = scaler.scale(data) self.assertEqual( @@ -333,9 +336,9 @@ def nan_inf(self): ) scaler = paddle.amp.AmpScaler(init_loss_scaling=1024) data = fluid.dygraph.to_variable(inp_np) - - out = model(data) - loss = paddle.mean(out) + with paddle.amp.auto_cast(dtype='float16'): + out = model(data) + loss = paddle.mean(out) scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) @@ -348,6 +351,8 @@ def nan_inf(self): ) def test_nan_inf(self): + if not paddle.amp.is_float16_supported(): + return self.nan_inf() def step_update_exception(self): diff --git a/test/amp/test_amp_api.py b/test/amp/test_amp_api.py new file mode 100644 index 0000000000000..7d397c7043206 --- /dev/null +++ b/test/amp/test_amp_api.py @@ -0,0 +1,66 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from amp_base_models import AmpTestBase + +import paddle + + +class TestAutoCast(AmpTestBase): + def test_amp_OD_level(self): + conv = paddle.nn.Conv2D( + in_channels=1, out_channels=6, kernel_size=3, bias_attr=False + ) + linear = paddle.nn.Linear(in_features=4, out_features=4) + with paddle.amp.auto_cast(level='OD'): + out1 = conv(paddle.rand(shape=[1, 1, 6, 6], dtype='float32')) + out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') + out3 = linear(out2) + + self.assertEqual(out1.dtype, paddle.float16) + self.assertEqual(out2.dtype, paddle.float32) + self.assertEqual(out3.dtype, paddle.float32) + + +class TestGradScaler(AmpTestBase): + def test_amp_grad_scaler(self): + model = paddle.nn.Conv2D(3, 2, 3) + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=model.parameters() + ) + scaler = paddle.amp.GradScaler() + data = paddle.rand([1, 3, 8, 8], dtype='float32') + paddle.amp.debugging.enable_operator_stats_collection() + with paddle.amp.auto_cast( + custom_black_list=['conv2d'], dtype='bfloat16' + ): + out = model(data) + loss = out.mean() + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + paddle.amp.debugging.disable_operator_stats_collection() + op_list = paddle.fluid.core.get_low_precision_op_list() + + self.assertEqual(scaler._enable, False) + self.assertEqual(scaler._use_dynamic_loss_scaling, False) + self.assertTrue('scale' not in op_list) + self.assertTrue('check_finite_and_unscale' not in op_list) + + +if __name__ == '__main__': + unittest.main() From 25b4ba7fef890aa74813ba47cecf175cc5bbe3a6 Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:15:57 +0800 Subject: [PATCH 106/405] Move fused feedforward (#53166) * trans fused_feedward Compute function to phi * add register info * remove maxfunctor * move fused feedward to phi * remove sig file * remove fliud include * add include * add include * add sig file * add output register info * fix sig file * Update fused_feedforward_sig.cc * fix grad kernel * update output register info * fix * open fused_feedforward static build * add optional and fix code style * fix output info for fused attention * add optional param * merge --- .../operators/fused/fused_attention_op.cu | 2 + .../operators/fused/fused_feedforward_op.cu | 1254 +++++++++-------- paddle/phi/kernels/funcs/functors.h | 5 - .../kernels/fused_feedforward_grad_kernel.h | 4 +- .../kernels/fusion/gpu/fused_dropout_helper.h | 33 +- .../xpu/fused_feedforward_grad_kernel.cc | 6 +- .../fluid/tests/unittests/CMakeLists.txt | 4 + 7 files changed, 673 insertions(+), 635 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index de62fe38653b5..b1212f410fe9f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -813,6 +813,8 @@ PD_REGISTER_KERNEL(fused_attention, phi::dtype::float16, double, float) { + kernel->OutputAt(9).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(14).SetDataType(phi::DataType::UINT8); if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 2c3456d31025b..ee40633e4252b 100644 --- 
a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -13,634 +13,668 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/fluid/operators/fused/fused_attention_utils.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/core/errors.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" +#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" + +namespace phi { +namespace fusion { + +template +void MatMul(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& a, + const phi::DenseTensor& b, + phi::DenseTensor* c) { + auto blas = phi::funcs::GetBlas(dev_ctx); + auto a_2d = phi::FoldInitDims(a); + auto b_2d = phi::FoldInitDims(b); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, false); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, false); + T alpha = static_cast(1.0); + blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); +} -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/process_group_nccl.h" -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace operators { - -template -static void AllReduce(phi::DenseTensor& tensor, // NOLINT - const int ring_id, - const phi::GPUContext& ctx) { - if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); - - if (map->has(ring_id)) { - paddle::distributed::ProcessGroup* pg = map->get(ring_id); - auto pg_nccl = static_cast(pg); - paddle::distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg_nccl->AllReduce(&tensor, tensor, opts, true, true); - task->Wait(); +template +void FFN(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor& linear2_weight, + const phi::DenseTensor* linear2_bias, + const phi::DenseTensor* ln1_scale, + const phi::DenseTensor* ln1_bias, + const phi::DenseTensor* ln2_scale, + const phi::DenseTensor* ln2_bias, + phi::DenseTensor* out, + phi::DenseTensor* dropout1_mask, + phi::DenseTensor* dropout2_mask, + phi::DenseTensor* ln1_mean, + phi::DenseTensor* ln1_variance, + phi::DenseTensor* ln2_mean, + phi::DenseTensor* ln2_variance, + phi::DenseTensor* linear1_out, + phi::DenseTensor* ln1_out, + phi::DenseTensor* dropout1_out, + phi::DenseTensor* dropout2_out, + const int bsz_seq, + const int d_model, + const int dim_feedforward, + const std::string& act_method, + const bool pre_layer_norm, + const float epsilon1, + const float epsilon2, + const bool add_residual, + const int ring_id, + const phi::fusion::DropoutParam& dropout_param1, + const phi::fusion::DropoutParam& dropout_param2) { + phi::fusion::FusedDropoutLayerNormHelper pre_layernorm_helper( + bsz_seq, d_model, epsilon1); + 
phi::fusion::FusedDropoutHelper fused_act_dropout_helper( + dev_ctx, bsz_seq, dim_feedforward, dropout_param1); + phi::fusion::FusedDropoutLayerNormHelper + fused_dropout_layernorm_helper( + dev_ctx, bsz_seq, d_model, dropout_param2, epsilon2); + + using U = phi::funcs::LayerNormParamType; + const phi::DenseTensor* in = &x; + + const U* ln1_scale_ptr = + ln1_scale == nullptr ? nullptr : ln1_scale->data(); + const U* ln1_bias_ptr = ln1_bias == nullptr ? nullptr : ln1_bias->data(); + const U* ln2_scale_ptr = + ln2_scale == nullptr ? nullptr : ln2_scale->data(); + const U* ln2_bias_ptr = ln2_bias == nullptr ? nullptr : ln2_bias->data(); + const T* linear1_bias_ptr = + linear1_bias == nullptr ? nullptr : linear1_bias->data(); + const T* linear2_bias_ptr = + linear2_bias == nullptr ? nullptr : linear2_bias->data(); + + if (pre_layer_norm) { + pre_layernorm_helper.LayerNorm(dev_ctx, + x.data(), + ln1_scale_ptr, + ln1_bias_ptr, + ln1_out->data(), + ln1_mean->data(), + ln1_variance->data()); + in = ln1_out; + } + MatMul(dev_ctx, *in, linear1_weight, linear1_out); + fused_act_dropout_helper.DropoutActBias(dev_ctx, + linear1_out->data(), + linear1_bias_ptr, + act_method, + dropout1_out->data(), + dropout1_mask->data()); + phi::DenseTensor linear2_out; + linear2_out.Resize({bsz_seq, d_model}); + dev_ctx.template Alloc(&linear2_out, linear2_out.numel() * sizeof(T)); + MatMul(dev_ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + phi::fusion::AllReduce(linear2_out, ring_id, dev_ctx); + + const T* residual_ptr = add_residual ? x.data() : nullptr; + if (!pre_layer_norm) { + // TODO(Xreki): support post layer_norm case when add_residual is false. + PADDLE_ENFORCE_EQ(add_residual, + true, + phi::errors::InvalidArgument( + "Attribute add_residual is expected to be true " + "when pre_layer_norm is false.")); + + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + dev_ctx, + linear2_out.data(), + residual_ptr, + linear2_bias_ptr, + ln2_scale_ptr, + ln2_bias_ptr, + dropout2_out->data(), + dropout2_mask->data(), + out->data(), + ln2_mean->data(), + ln2_variance->data()); } else { - auto dtype = platform::ToNCCLDataType( - framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void* sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void* recvbuff = ctx.Alloc(&tensor, tensor.numel() * sizeof(T)); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + fused_dropout_layernorm_helper.ResidualDropoutBias( + dev_ctx, + linear2_out.data(), + residual_ptr, + linear2_bias_ptr, + out->data(), + dropout2_mask->data()); } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "PaddlePaddle should compile with NCCL or RCCL when used tensor model " - "parallel op.")); -#endif } -template -class FusedFeedForwardKernel : public framework::OpKernel { - public: - void MatMul(const phi::GPUContext& ctx, - const phi::DenseTensor& a, - const phi::DenseTensor& b, - phi::DenseTensor* c) const { - auto blas = phi::funcs::GetBlas(ctx); - auto a_2d = FoldInitDims(a); - auto b_2d = FoldInitDims(b); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, false); - T alpha = static_cast(1.0); - blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); +template +void 
FusedFeedForwardKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& dropout1_seed, + const paddle::optional& dropout2_seed, + const DenseTensor& linear1_weight, + const paddle::optional& linear1_bias, + const DenseTensor& linear2_weight, + const paddle::optional& linear2_bias, + const paddle::optional& ln1_scale, + const paddle::optional& ln1_bias, + const paddle::optional& ln2_scale, + const paddle::optional& ln2_bias, + bool pre_layer_norm, + float ln1_epsilon, + float ln2_epsilon, + const std::string& act_method, + float dropout1_prob, + float dropout2_prob, + const std::string& dropout1_implementation, + const std::string& dropout2_implementation, + bool is_test, + bool dropout1_fix_seed, + bool dropout2_fix_seed, + int dropout1_seed_val, + int dropout2_seed_val, + bool add_residual, + int ring_id, + DenseTensor* out, + DenseTensor* dropout1_mask, + DenseTensor* dropout2_mask, + DenseTensor* ln1_mean, + DenseTensor* ln1_variance, + DenseTensor* ln2_mean, + DenseTensor* ln2_variance, + DenseTensor* linear1_out, + DenseTensor* ln1_out, + DenseTensor* dropout1_out, + DenseTensor* dropout2_out) { + auto* x_ptr = &x; + auto* linear1_weight_ptr = &linear1_weight; + auto* linear1_bias_ptr = linear1_bias.get_ptr(); + auto* linear2_weight_ptr = &linear2_weight; + auto* linear2_bias_ptr = linear2_bias.get_ptr(); + + auto* ln1_scale_ptr = pre_layer_norm ? ln1_scale.get_ptr() : nullptr; + auto* ln1_bias_ptr = pre_layer_norm ? ln1_bias.get_ptr() : nullptr; + auto* ln2_scale_ptr = !pre_layer_norm ? ln2_scale.get_ptr() : nullptr; + auto* ln2_bias_ptr = !pre_layer_norm ? ln2_bias.get_ptr() : nullptr; + + if (!pre_layer_norm) { + ln1_mean = nullptr; + ln1_variance = nullptr; + ln1_out = nullptr; + } else { + ln2_mean = nullptr; + ln2_variance = nullptr; } - void FFN(const phi::GPUContext& ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& linear1_weight, - const phi::DenseTensor* linear1_bias, - const phi::DenseTensor& linear2_weight, - const phi::DenseTensor* linear2_bias, - const phi::DenseTensor* ln1_scale, - const phi::DenseTensor* ln1_bias, - const phi::DenseTensor* ln2_scale, - const phi::DenseTensor* ln2_bias, - phi::DenseTensor* out, - phi::DenseTensor* dropout1_mask, - phi::DenseTensor* dropout2_mask, - phi::DenseTensor* ln1_mean, - phi::DenseTensor* ln1_variance, - phi::DenseTensor* ln2_mean, - phi::DenseTensor* ln2_variance, - phi::DenseTensor* linear1_out, - phi::DenseTensor* ln1_out, - phi::DenseTensor* dropout1_out, - phi::DenseTensor* dropout2_out, - const int bsz_seq, - const int d_model, - const int dim_feedforward, - const std::string& act_method, - const bool pre_layer_norm, - const float epsilon1, - const float epsilon2, - const bool add_residual, - const int ring_id, - const DropoutParam& dropout_param1, - const DropoutParam& dropout_param2) const { - FusedDropoutLayerNormHelper pre_layernorm_helper( - bsz_seq, d_model, epsilon1); - FusedDropoutHelper fused_act_dropout_helper( - ctx, bsz_seq, dim_feedforward, dropout_param1); - FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( - ctx, bsz_seq, d_model, dropout_param2, epsilon2); - - using U = phi::funcs::LayerNormParamType; - const phi::DenseTensor* in = &x; - - const U* ln1_scale_ptr = - ln1_scale == nullptr ? nullptr : ln1_scale->data(); - const U* ln1_bias_ptr = ln1_bias == nullptr ? nullptr : ln1_bias->data(); - const U* ln2_scale_ptr = - ln2_scale == nullptr ? nullptr : ln2_scale->data(); - const U* ln2_bias_ptr = ln2_bias == nullptr ? 
nullptr : ln2_bias->data(); - const T* linear1_bias_ptr = - linear1_bias == nullptr ? nullptr : linear1_bias->data(); - const T* linear2_bias_ptr = - linear2_bias == nullptr ? nullptr : linear2_bias->data(); - - if (pre_layer_norm) { - pre_layernorm_helper.LayerNorm(ctx, - x.data(), - ln1_scale_ptr, - ln1_bias_ptr, - ln1_out->data(), - ln1_mean->data(), - ln1_variance->data()); - in = ln1_out; - } - MatMul(ctx, *in, linear1_weight, linear1_out); - fused_act_dropout_helper.DropoutActBias(ctx, - linear1_out->data(), - linear1_bias_ptr, - act_method, - dropout1_out->data(), - dropout1_mask->data()); - phi::DenseTensor linear2_out; - linear2_out.Resize({bsz_seq, d_model}); - ctx.Alloc(&linear2_out, linear2_out.numel() * sizeof(T)); - MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + bool is_upscale_in_train1 = dropout1_implementation == "upscale_in_train"; + bool is_upscale_in_train2 = dropout2_implementation == "upscale_in_train"; + auto* dropout1_seed_ptr = dropout1_seed.get_ptr(); + auto* dropout2_seed_ptr = dropout2_seed.get_ptr(); + + phi::fusion::DropoutParam dropout_param1(dropout1_fix_seed, + 0, + is_test, + is_upscale_in_train1, + dropout1_prob, + dropout1_seed_ptr, + dropout1_seed_val); + phi::fusion::DropoutParam dropout_param2(dropout2_fix_seed, + 0, + is_test, + is_upscale_in_train2, + dropout2_prob, + dropout2_seed_ptr, + dropout2_seed_val); + + using U = phi::funcs::LayerNormParamType; + dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + dev_ctx.template Alloc(dropout1_mask, + dropout1_mask->numel() * sizeof(uint8_t)); + dev_ctx.template Alloc(dropout2_mask, + dropout2_mask->numel() * sizeof(uint8_t)); + if (pre_layer_norm) { + dev_ctx.template Alloc(ln1_mean, ln1_mean->numel() * sizeof(U)); + dev_ctx.template Alloc(ln1_variance, ln1_variance->numel() * sizeof(U)); + dev_ctx.template Alloc(ln1_out, ln1_out->numel() * sizeof(T)); + } else { + dev_ctx.template Alloc(ln2_mean, ln2_mean->numel() * sizeof(U)); + dev_ctx.template Alloc(ln2_variance, ln2_variance->numel() * sizeof(U)); + } - // tensor model parallel - AllReduce(linear2_out, ring_id, ctx); - - const T* residual_ptr = add_residual ? x.data() : nullptr; - if (!pre_layer_norm) { - // TODO(Xreki): support post layer_norm case when add_residual is false. 
- PADDLE_ENFORCE_EQ(add_residual, - true, - platform::errors::InvalidArgument( - "Attribute add_residual is expected to be true " - "when pre_layer_norm is false.")); - - fused_dropout_layernorm_helper.LayernormResidualDropoutBias( - ctx, - linear2_out.data(), - residual_ptr, - linear2_bias_ptr, - ln2_scale_ptr, - ln2_bias_ptr, - dropout2_out->data(), - dropout2_mask->data(), - out->data(), - ln2_mean->data(), - ln2_variance->data()); - } else { - fused_dropout_layernorm_helper.ResidualDropoutBias( - ctx, - linear2_out.data(), - residual_ptr, - linear2_bias_ptr, - out->data(), - dropout2_mask->data()); - } + dev_ctx.template Alloc(linear1_out, linear1_out->numel() * sizeof(T)); + dev_ctx.template Alloc(dropout1_out, dropout1_out->numel() * sizeof(T)); + dev_ctx.template Alloc(dropout2_out, dropout2_out->numel() * sizeof(T)); + + auto x_dim = x_ptr->dims(); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( + phi::RowMatrixFromVector(x_dim), 0, false); + + auto dim = linear1_weight_ptr->dims(); + int d_model = dim[0]; + int dim_feedforward = dim[dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + phi::fusion::FFN(dev_ctx, + x, + linear1_weight, + linear1_bias_ptr, + linear2_weight, + linear2_bias_ptr, + ln1_scale_ptr, + ln1_bias_ptr, + ln2_scale_ptr, + ln2_bias_ptr, + out, + dropout1_mask, + dropout2_mask, + ln1_mean, + ln1_variance, + ln2_mean, + ln2_variance, + linear1_out, + ln1_out, + dropout1_out, + dropout2_out, + bsz_seq, + d_model, + dim_feedforward, + act_method, + pre_layer_norm, + ln1_epsilon, + ln2_epsilon, + add_residual, + ring_id, + dropout_param1, + dropout_param2); +} + +template +void MatMulGrad(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& d_out, + const phi::DenseTensor& a, + const phi::DenseTensor& b, + phi::DenseTensor* d_a, + phi::DenseTensor* d_b) { + auto blas = phi::funcs::GetBlas(dev_ctx); + auto a_2d = phi::FoldInitDims(a); + auto b_2d = phi::FoldInitDims(b); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, true); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, true); + auto mat_dim_dout = + phi::funcs::CreateMatrixDescriptor(d_out.dims(), 0, false); + T alpha = static_cast(1.0); + blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0)); + blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); +} + +template +void FFNGrad(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& d_out, + const phi::DenseTensor& x, + const phi::DenseTensor& dropout1_mask, + const phi::DenseTensor& dropout2_mask, + const phi::DenseTensor& linear1_out, + const phi::DenseTensor* ln1_out, + const phi::DenseTensor& dropout1_out, + const phi::DenseTensor* dropout2_out, + const phi::DenseTensor& linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor& linear2_weight, + const phi::DenseTensor* ln1_gamma, + const phi::DenseTensor* ln1_beta, + const phi::DenseTensor* ln1_mean, + const phi::DenseTensor* ln1_variance, + const phi::DenseTensor* ln2_gamma, + const phi::DenseTensor* ln2_beta, + const phi::DenseTensor* ln2_mean, + const phi::DenseTensor* ln2_variance, + phi::DenseTensor* d_x, + phi::DenseTensor* d_linear1_weight, + phi::DenseTensor* d_linear1_bias, + phi::DenseTensor* d_linear2_weight, + phi::DenseTensor* d_linear2_bias, + phi::DenseTensor* d_ln1_gamma, + phi::DenseTensor* d_ln1_beta, + phi::DenseTensor* d_ln2_gamma, + phi::DenseTensor* d_ln2_beta, + const int bsz_seq, + const int d_model, + const int dim_feedforward, + const 
phi::fusion::DropoutParam& dropout_param1, + const phi::fusion::DropoutParam& dropout_param2, + const std::string& act_method, + const bool pre_layer_norm, + const float epsilon1, + const float epsilon2, + const bool add_residual, + const int ring_id) { + phi::fusion::FusedDropoutLayerNormHelper pre_layernorm_helper( + bsz_seq, d_model, epsilon1); + phi::fusion::FusedDropoutHelper fused_act_dropout_helper( + dev_ctx, bsz_seq, dim_feedforward, dropout_param1); + phi::fusion::FusedDropoutLayerNormHelper + fused_dropout_layernorm_helper( + dev_ctx, bsz_seq, d_model, dropout_param2, epsilon2); + + using U = phi::funcs::LayerNormParamType; + const U* ln1_gamma_ptr = + ln1_gamma == nullptr ? nullptr : ln1_gamma->data(); + const U* ln1_beta_ptr = ln1_beta == nullptr ? nullptr : ln1_beta->data(); + const U* ln2_gamma_ptr = + ln2_gamma == nullptr ? nullptr : ln2_gamma->data(); + const U* ln2_beta_ptr = ln2_beta == nullptr ? nullptr : ln2_beta->data(); + const T* linear1_bias_ptr = + linear1_bias == nullptr ? nullptr : linear1_bias->data(); + T* d_linear1_bias_ptr = + d_linear1_bias == nullptr ? nullptr : d_linear1_bias->data(); + T* d_linear2_bias_ptr = + d_linear2_bias == nullptr ? nullptr : d_linear2_bias->data(); + U* d_ln1_gamma_ptr = + d_ln1_gamma == nullptr ? nullptr : d_ln1_gamma->data(); + U* d_ln1_beta_ptr = d_ln1_beta == nullptr ? nullptr : d_ln1_beta->data(); + U* d_ln2_gamma_ptr = + d_ln2_gamma == nullptr ? nullptr : d_ln2_gamma->data(); + U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? nullptr : d_ln2_beta->data(); + + phi::DenseTensor d_linear2_out, d_dropout2_out, d_residual; + d_linear2_out.Resize({bsz_seq, d_model}); + dev_ctx.template Alloc(&d_linear2_out, d_linear2_out.numel() * sizeof(T)); + d_dropout2_out.Resize({bsz_seq, d_model}); + dev_ctx.template Alloc(&d_dropout2_out, + d_dropout2_out.numel() * sizeof(T)); + + T* d_residual_ptr = nullptr; + if (add_residual) { + d_residual.Resize(d_x->dims()); + d_residual_ptr = + dev_ctx.template Alloc(&d_residual, d_residual.numel() * sizeof(T)); + } + if (pre_layer_norm) { + fused_dropout_layernorm_helper.ResidualDropoutBiasGrad( + dev_ctx, + d_out.data(), + dropout2_mask.data(), + d_linear2_out.data(), + d_residual_ptr, + d_linear2_bias_ptr); + } else { + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + dev_ctx, + d_out.data(), + dropout2_out->data(), + dropout2_mask.data(), + ln2_gamma_ptr, + ln2_mean->data(), + ln2_variance->data(), + d_dropout2_out.data(), + d_ln2_gamma_ptr, + d_ln2_beta_ptr, + d_linear2_out.data(), + d_linear2_bias_ptr, + d_residual_ptr); } - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto* linear2_weight = context.Input("Linear2Weight"); - auto* linear2_bias = context.Input("Linear2Bias"); - const bool pre_layer_norm = context.Attr("pre_layer_norm"); - auto& dev_ctx = context.template device_context(); - - auto* ln1_scale = - pre_layer_norm ? context.Input("Ln1Scale") : nullptr; - auto* ln1_bias = - pre_layer_norm ? context.Input("Ln1Bias") : nullptr; - auto* ln2_scale = - !pre_layer_norm ? context.Input("Ln2Scale") : nullptr; - auto* ln2_bias = - !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; - - auto* ln1_mean = - pre_layer_norm ? context.Output("Ln1Mean") : nullptr; - auto* ln1_variance = pre_layer_norm - ? context.Output("Ln1Variance") - : nullptr; - auto* ln2_mean = - !pre_layer_norm ? 
context.Output("Ln2Mean") : nullptr; - auto* ln2_variance = !pre_layer_norm - ? context.Output("Ln2Variance") - : nullptr; - auto* out = context.Output("Out"); - auto* dropout1_mask = context.Output("Dropout1Mask"); - auto* dropout2_mask = context.Output("Dropout2Mask"); - auto* linear1_out = context.Output("Linear1Out"); - auto* ln1_out = - pre_layer_norm ? context.Output("Ln1Out") : nullptr; - auto* dropout1_out = context.Output("Dropout1Out"); - auto* dropout2_out = context.Output("Dropout2Out"); - - const std::string act_method = context.Attr("act_method"); - - const float epsilon1 = context.Attr("ln1_epsilon"); - const float epsilon2 = context.Attr("ln2_epsilon"); - const int ring_id = context.Attr("ring_id"); - const bool add_residual = context.Attr("add_residual"); - - DropoutParam dropout_param1(context, 1); - DropoutParam dropout_param2(context, 2); - - using U = phi::funcs::LayerNormParamType; - dev_ctx.Alloc(out, out->numel() * sizeof(T)); - dev_ctx.Alloc(dropout1_mask, - dropout1_mask->numel() * sizeof(uint8_t)); - dev_ctx.Alloc(dropout2_mask, - dropout2_mask->numel() * sizeof(uint8_t)); - if (pre_layer_norm) { - dev_ctx.Alloc(ln1_mean, ln1_mean->numel() * sizeof(U)); - dev_ctx.Alloc(ln1_variance, ln1_variance->numel() * sizeof(U)); - dev_ctx.Alloc(ln1_out, ln1_out->numel() * sizeof(T)); - } else { - dev_ctx.Alloc(ln2_mean, ln2_mean->numel() * sizeof(U)); - dev_ctx.Alloc(ln2_variance, ln2_variance->numel() * sizeof(U)); - } - - dev_ctx.Alloc(linear1_out, linear1_out->numel() * sizeof(T)); - dev_ctx.Alloc(dropout1_out, dropout1_out->numel() * sizeof(T)); - dev_ctx.Alloc(dropout2_out, dropout2_out->numel() * sizeof(T)); - - auto x_dim = x->dims(); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dim), 0, false); - - auto dim = linear1_weight->dims(); - int d_model = dim[0]; - int dim_feedforward = dim[dim.size() - 1]; - int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; - - FFN(context.cuda_device_context(), - *x, - *linear1_weight, - linear1_bias, - *linear2_weight, - linear2_bias, - ln1_scale, - ln1_bias, - ln2_scale, - ln2_bias, - out, - dropout1_mask, - dropout2_mask, - ln1_mean, - ln1_variance, - ln2_mean, - ln2_variance, - linear1_out, - ln1_out, - dropout1_out, - dropout2_out, - bsz_seq, - d_model, - dim_feedforward, - act_method, - pre_layer_norm, - epsilon1, - epsilon2, - add_residual, - ring_id, - dropout_param1, - dropout_param2); + phi::DenseTensor d_dropout1_out; + d_dropout1_out.Resize({bsz_seq, dim_feedforward}); + dev_ctx.template Alloc(&d_dropout1_out, + d_dropout1_out.numel() * sizeof(T)); + MatMulGrad(dev_ctx, + d_linear2_out, + dropout1_out, + linear2_weight, + &d_dropout1_out, + d_linear2_weight); + + phi::DenseTensor d_linear1_out; + d_linear1_out.Resize({bsz_seq, dim_feedforward}); + dev_ctx.template Alloc(&d_linear1_out, d_linear1_out.numel() * sizeof(T)); + fused_act_dropout_helper.DropoutActBiasGrad(dev_ctx, + d_dropout1_out.data(), + linear1_out.data(), + linear1_bias_ptr, + dropout1_mask.data(), + d_linear1_out.data(), + d_linear1_bias_ptr, + act_method); + + if (pre_layer_norm) { + phi::DenseTensor d_ln1_out; + d_ln1_out.Resize({bsz_seq, d_model}); + dev_ctx.template Alloc(&d_ln1_out, d_ln1_out.numel() * sizeof(T)); + MatMulGrad(dev_ctx, + d_linear1_out, + *ln1_out, + linear1_weight, + &d_ln1_out, + d_linear1_weight); + // tensor model parallel + phi::fusion::AllReduce(d_ln1_out, ring_id, dev_ctx); + pre_layernorm_helper.LayerNormGrad(dev_ctx, + d_ln1_out.data(), + x.data(), + ln1_gamma_ptr, + ln1_mean->data(), + 
ln1_variance->data(), + d_x->data(), + d_ln1_gamma_ptr, + d_ln1_beta_ptr); + } else { + MatMulGrad( + dev_ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + phi::fusion::AllReduce(*d_x, ring_id, dev_ctx); } -}; - -template -class FusedFeedForwardGradKernel : public framework::OpKernel { - public: - void MatMulGrad(const phi::GPUContext& ctx, - const phi::DenseTensor& d_out, - const phi::DenseTensor& a, - const phi::DenseTensor& b, - phi::DenseTensor* d_a, - phi::DenseTensor* d_b) const { - auto blas = phi::funcs::GetBlas(ctx); - auto a_2d = FoldInitDims(a); - auto b_2d = FoldInitDims(b); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, true); - auto mat_dim_dout = - phi::funcs::CreateMatrixDescriptor(d_out.dims(), 0, false); - T alpha = static_cast(1.0); - blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0)); - blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); + + if (add_residual) { + // gradient accumulation + std::vector ins = {&d_residual, d_x}; + std::vector outs = {d_x}; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, phi::funcs::AddFunctor()); } +} - void FFNGrad(const phi::GPUContext& ctx, - const phi::DenseTensor& d_out, - const phi::DenseTensor& x, - const phi::DenseTensor& dropout1_mask, - const phi::DenseTensor& dropout2_mask, - const phi::DenseTensor& linear1_out, - const phi::DenseTensor* ln1_out, - const phi::DenseTensor& dropout1_out, - const phi::DenseTensor* dropout2_out, - const phi::DenseTensor& linear1_weight, - const phi::DenseTensor* linear1_bias, - const phi::DenseTensor& linear2_weight, - const phi::DenseTensor* ln1_gamma, - const phi::DenseTensor* ln1_beta, - const phi::DenseTensor* ln1_mean, - const phi::DenseTensor* ln1_variance, - const phi::DenseTensor* ln2_gamma, - const phi::DenseTensor* ln2_beta, - const phi::DenseTensor* ln2_mean, - const phi::DenseTensor* ln2_variance, - phi::DenseTensor* d_x, - phi::DenseTensor* d_linear1_weight, - phi::DenseTensor* d_linear1_bias, - phi::DenseTensor* d_linear2_weight, - phi::DenseTensor* d_linear2_bias, - phi::DenseTensor* d_ln1_gamma, - phi::DenseTensor* d_ln1_beta, - phi::DenseTensor* d_ln2_gamma, - phi::DenseTensor* d_ln2_beta, - const int bsz_seq, - const int d_model, - const int dim_feedforward, - const DropoutParam& dropout_param1, - const DropoutParam& dropout_param2, - const std::string& act_method, - const bool pre_layer_norm, - const float epsilon1, - const float epsilon2, - const bool add_residual, - const int ring_id) const { - FusedDropoutLayerNormHelper pre_layernorm_helper( - bsz_seq, d_model, epsilon1); - FusedDropoutHelper fused_act_dropout_helper( - ctx, bsz_seq, dim_feedforward, dropout_param1); - FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( - ctx, bsz_seq, d_model, dropout_param2, epsilon2); - - using U = phi::funcs::LayerNormParamType; - const U* ln1_gamma_ptr = - ln1_gamma == nullptr ? nullptr : ln1_gamma->data(); - const U* ln1_beta_ptr = ln1_beta == nullptr ? nullptr : ln1_beta->data(); - const U* ln2_gamma_ptr = - ln2_gamma == nullptr ? nullptr : ln2_gamma->data(); - const U* ln2_beta_ptr = ln2_beta == nullptr ? nullptr : ln2_beta->data(); - const T* linear1_bias_ptr = - linear1_bias == nullptr ? nullptr : linear1_bias->data(); - T* d_linear1_bias_ptr = - d_linear1_bias == nullptr ? nullptr : d_linear1_bias->data(); - T* d_linear2_bias_ptr = - d_linear2_bias == nullptr ? 
nullptr : d_linear2_bias->data(); - U* d_ln1_gamma_ptr = - d_ln1_gamma == nullptr ? nullptr : d_ln1_gamma->data(); - U* d_ln1_beta_ptr = d_ln1_beta == nullptr ? nullptr : d_ln1_beta->data(); - U* d_ln2_gamma_ptr = - d_ln2_gamma == nullptr ? nullptr : d_ln2_gamma->data(); - U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? nullptr : d_ln2_beta->data(); - - phi::DenseTensor d_linear2_out, d_dropout2_out, d_residual; - d_linear2_out.Resize({bsz_seq, d_model}); - ctx.Alloc(&d_linear2_out, d_linear2_out.numel() * sizeof(T)); - d_dropout2_out.Resize({bsz_seq, d_model}); - ctx.Alloc(&d_dropout2_out, d_dropout2_out.numel() * sizeof(T)); - - T* d_residual_ptr = nullptr; - if (add_residual) { - d_residual.Resize(d_x->dims()); - d_residual_ptr = - ctx.Alloc(&d_residual, d_residual.numel() * sizeof(T)); - } - if (pre_layer_norm) { - fused_dropout_layernorm_helper.ResidualDropoutBiasGrad( - ctx, - d_out.data(), - dropout2_mask.data(), - d_linear2_out.data(), - d_residual_ptr, - d_linear2_bias_ptr); - } else { - fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( - ctx, - d_out.data(), - dropout2_out->data(), - dropout2_mask.data(), - ln2_gamma_ptr, - ln2_mean->data(), - ln2_variance->data(), - d_dropout2_out.data(), - d_ln2_gamma_ptr, - d_ln2_beta_ptr, - d_linear2_out.data(), - d_linear2_bias_ptr, - d_residual_ptr); - } - - phi::DenseTensor d_dropout1_out; - d_dropout1_out.Resize({bsz_seq, dim_feedforward}); - ctx.Alloc(&d_dropout1_out, d_dropout1_out.numel() * sizeof(T)); - MatMulGrad(ctx, - d_linear2_out, - dropout1_out, - linear2_weight, - &d_dropout1_out, - d_linear2_weight); - - phi::DenseTensor d_linear1_out; - d_linear1_out.Resize({bsz_seq, dim_feedforward}); - ctx.Alloc(&d_linear1_out, d_linear1_out.numel() * sizeof(T)); - fused_act_dropout_helper.DropoutActBiasGrad(ctx, - d_dropout1_out.data(), - linear1_out.data(), - linear1_bias_ptr, - dropout1_mask.data(), - d_linear1_out.data(), - d_linear1_bias_ptr, - act_method); - - if (pre_layer_norm) { - phi::DenseTensor d_ln1_out; - d_ln1_out.Resize({bsz_seq, d_model}); - ctx.Alloc(&d_ln1_out, d_ln1_out.numel() * sizeof(T)); - MatMulGrad(ctx, - d_linear1_out, - *ln1_out, - linear1_weight, - &d_ln1_out, - d_linear1_weight); - // tensor model parallel - AllReduce(d_ln1_out, ring_id, ctx); - pre_layernorm_helper.LayerNormGrad(ctx, - d_ln1_out.data(), - x.data(), - ln1_gamma_ptr, - ln1_mean->data(), - ln1_variance->data(), - d_x->data(), - d_ln1_gamma_ptr, - d_ln1_beta_ptr); - } else { - MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); - // tensor model parallel - AllReduce(*d_x, ring_id, ctx); - } - - if (add_residual) { - // gradient accumulation - std::vector ins = {&d_residual, d_x}; - std::vector outs = {d_x}; - phi::funcs::ElementwiseKernel( - ctx, ins, &outs, phi::funcs::AddFunctor()); - } +template +void FusedFeedForwardGradKernel( + const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& linear1_weight, + const paddle::optional& linear1_bias, + const DenseTensor& linear2_weight, + const DenseTensor& dropout1_mask, + const DenseTensor& dropout2_mask, + const DenseTensor& linear1_out, + const DenseTensor& dropout1_out, + const paddle::optional& dropout2_out, + const paddle::optional& ln1_scale, + const paddle::optional& ln1_bias, + const paddle::optional& ln1_out, + const paddle::optional& ln1_mean, + const paddle::optional& ln1_variance, + const paddle::optional& ln2_scale, + const paddle::optional& ln2_bias, + const paddle::optional& ln2_mean, + const paddle::optional& 
ln2_variance, + const paddle::optional& linear2_bias, + bool pre_layer_norm, + float ln1_epsilon, + float ln2_epsilon, + const std::string& act_method, + float dropout1_prob, + float dropout2_prob, + const std::string& dropout1_implementation, + const std::string& dropout2_implementation, + bool is_test, + bool dropout1_fix_seed, + bool dropout2_fix_seed, + int dropout1_seed_val, + int dropout2_seed_val, + bool add_residual, + int ring_id, + DenseTensor* x_grad, + DenseTensor* ln1_scale_grad, + DenseTensor* ln1_bias_grad, + DenseTensor* ln2_scale_grad, + DenseTensor* ln2_bias_grad, + DenseTensor* linear1_weight_grad, + DenseTensor* linear1_bias_grad, + DenseTensor* linear2_weight_grad, + DenseTensor* linear2_bias_grad) { + using U = phi::funcs::LayerNormParamType; + + auto* ln1_out_ptr = pre_layer_norm ? ln1_out.get_ptr() : nullptr; + auto* dropout2_out_ptr = dropout2_out.get_ptr(); + auto* linear1_bias_ptr = linear1_bias.get_ptr(); + auto* ln1_mean_ptr = pre_layer_norm ? ln1_mean.get_ptr() : nullptr; + auto* ln1_variance_ptr = pre_layer_norm ? ln1_variance.get_ptr() : nullptr; + auto* ln1_scale_ptr = pre_layer_norm ? ln1_scale.get_ptr() : nullptr; + auto* ln1_bias_ptr = pre_layer_norm ? ln1_bias.get_ptr() : nullptr; + auto* ln2_mean_ptr = !pre_layer_norm ? ln2_mean.get_ptr() : nullptr; + auto* ln2_variance_ptr = !pre_layer_norm ? ln2_variance.get_ptr() : nullptr; + auto* ln2_scale_ptr = !pre_layer_norm ? ln2_scale.get_ptr() : nullptr; + auto* ln2_bias_ptr = !pre_layer_norm ? ln2_bias.get_ptr() : nullptr; + + auto* d_x = x_grad; + auto* d_ln1_scale = pre_layer_norm ? ln1_scale_grad : nullptr; + auto* d_ln1_bias = pre_layer_norm ? ln1_bias_grad : nullptr; + auto* d_ln2_scale = pre_layer_norm ? nullptr : ln2_scale_grad; + auto* d_ln2_bias = pre_layer_norm ? 
nullptr : ln2_bias_grad; + auto* d_linear1_weight = linear1_weight_grad; + auto* d_linear1_bias = linear1_bias_grad; + auto* d_linear2_weight = linear2_weight_grad; + auto* d_linear2_bias = linear2_bias_grad; + + bool is_upscale_in_train1 = dropout1_implementation == "upscale_in_train"; + bool is_upscale_in_train2 = dropout2_implementation == "upscale_in_train"; + + phi::fusion::DropoutParam dropout_param1(dropout1_fix_seed, + 0, + is_test, + is_upscale_in_train1, + dropout1_prob, + nullptr, + dropout1_seed_val); + phi::fusion::DropoutParam dropout_param2(dropout2_fix_seed, + 0, + is_test, + is_upscale_in_train2, + dropout2_prob, + nullptr, + dropout2_seed_val); + + dev_ctx.template Alloc(d_x, d_x->numel() * sizeof(T)); + if (d_ln1_scale) { + dev_ctx.template Alloc(d_ln1_scale, d_ln1_scale->numel() * sizeof(U)); + } + if (d_ln1_bias) { + dev_ctx.template Alloc(d_ln1_bias, d_ln1_bias->numel() * sizeof(U)); + } + if (d_ln2_scale) { + dev_ctx.template Alloc(d_ln2_scale, d_ln2_scale->numel() * sizeof(U)); + } + if (d_ln2_bias) { + dev_ctx.template Alloc(d_ln2_bias, d_ln2_bias->numel() * sizeof(U)); } + if (d_linear1_bias) { + dev_ctx.template Alloc(d_linear1_bias, + d_linear1_bias->numel() * sizeof(T)); + } + if (d_linear2_bias) { + dev_ctx.template Alloc(d_linear2_bias, + d_linear2_bias->numel() * sizeof(T)); + } + dev_ctx.template Alloc(d_linear1_weight, + d_linear1_weight->numel() * sizeof(T)); + dev_ctx.template Alloc(d_linear2_weight, + d_linear2_weight->numel() * sizeof(T)); + + auto x_dim = x.dims(); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( + phi::RowMatrixFromVector(x_dim), 0, false); + + auto linear1_weight_dim = linear1_weight.dims(); + int d_model = linear1_weight_dim[0]; + int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + FFNGrad(dev_ctx, + out_grad, + x, + dropout1_mask, + dropout2_mask, + linear1_out, + ln1_out_ptr, + dropout1_out, + dropout2_out_ptr, + linear1_weight, + linear1_bias_ptr, + linear2_weight, + ln1_scale_ptr, + ln1_bias_ptr, + ln1_mean_ptr, + ln1_variance_ptr, + ln2_scale_ptr, + ln2_bias_ptr, + ln2_mean_ptr, + ln2_variance_ptr, + d_x, + d_linear1_weight, + d_linear1_bias, + d_linear2_weight, + d_linear2_bias, + d_ln1_scale, + d_ln1_bias, + d_ln2_scale, + d_ln2_bias, + bsz_seq, + d_model, + dim_feedforward, + dropout_param1, + dropout_param2, + act_method, + pre_layer_norm, + ln1_epsilon, + ln2_epsilon, + add_residual, + ring_id); +} +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_feedforward, + GPU, + ALL_LAYOUT, + phi::fusion::FusedFeedForwardKernel, + float, + double, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(2).SetDataType(phi::DataType::UINT8); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); + } +} - void Compute(const framework::ExecutionContext& context) const override { - using U = phi::funcs::LayerNormParamType; - auto& dev_ctx = context.template device_context(); - auto d_out = - *context.Input(framework::GradVarName("Out")); - auto x = *context.Input("X"); - const bool pre_layer_norm = context.Attr("pre_layer_norm"); - auto dropout1_mask = *context.Input("Dropout1Mask"); - auto dropout2_mask = *context.Input("Dropout2Mask"); - auto 
linear1_out = *context.Input("Linear1Out"); - auto* ln1_out = - pre_layer_norm ? context.Input("Ln1Out") : nullptr; - auto dropout1_out = *context.Input("Dropout1Out"); - auto* dropout2_out = context.Input("Dropout2Out"); - auto linear1_weight = *context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto linear2_weight = *context.Input("Linear2Weight"); - auto* ln1_mean = - pre_layer_norm ? context.Input("Ln1Mean") : nullptr; - auto* ln1_variance = pre_layer_norm - ? context.Input("Ln1Variance") - : nullptr; - auto* ln1_scale = - pre_layer_norm ? context.Input("Ln1Scale") : nullptr; - auto* ln1_bias = - pre_layer_norm ? context.Input("Ln1Bias") : nullptr; - auto* ln2_mean = - !pre_layer_norm ? context.Input("Ln2Mean") : nullptr; - auto* ln2_variance = !pre_layer_norm - ? context.Input("Ln2Variance") - : nullptr; - auto* ln2_scale = - !pre_layer_norm ? context.Input("Ln2Scale") : nullptr; - auto* ln2_bias = - !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; - - auto* d_x = context.Output(framework::GradVarName("X")); - auto* d_ln1_scale = pre_layer_norm ? context.Output( - framework::GradVarName("Ln1Scale")) - : nullptr; - auto* d_ln1_bias = pre_layer_norm ? context.Output( - framework::GradVarName("Ln1Bias")) - : nullptr; - auto* d_ln2_scale = pre_layer_norm - ? nullptr - : context.Output( - framework::GradVarName("Ln2Scale")); - auto* d_ln2_bias = pre_layer_norm ? nullptr - : context.Output( - framework::GradVarName("Ln2Bias")); - auto* d_linear1_weight = context.Output( - framework::GradVarName("Linear1Weight")); - auto* d_linear1_bias = - context.Output(framework::GradVarName("Linear1Bias")); - auto* d_linear2_weight = context.Output( - framework::GradVarName("Linear2Weight")); - auto* d_linear2_bias = - context.Output(framework::GradVarName("Linear2Bias")); - - const float epsilon1 = context.Attr("ln1_epsilon"); - const float epsilon2 = context.Attr("ln2_epsilon"); - const bool add_residual = context.Attr("add_residual"); - const int ring_id = context.Attr("ring_id"); - const std::string act_method = context.Attr("act_method"); - DropoutParam dropout_param1(context, 1); - DropoutParam dropout_param2(context, 2); - - dev_ctx.Alloc(d_x, d_x->numel() * sizeof(T)); - if (d_ln1_scale) { - dev_ctx.Alloc(d_ln1_scale, d_ln1_scale->numel() * sizeof(U)); - } - if (d_ln1_bias) { - dev_ctx.Alloc(d_ln1_bias, d_ln1_bias->numel() * sizeof(U)); - } - if (d_ln2_scale) { - dev_ctx.Alloc(d_ln2_scale, d_ln2_scale->numel() * sizeof(U)); - } - if (d_ln2_bias) { - dev_ctx.Alloc(d_ln2_bias, d_ln2_bias->numel() * sizeof(U)); - } - if (d_linear1_bias) { - dev_ctx.Alloc(d_linear1_bias, d_linear1_bias->numel() * sizeof(T)); - } - if (d_linear2_bias) { - dev_ctx.Alloc(d_linear2_bias, d_linear2_bias->numel() * sizeof(T)); - } - dev_ctx.Alloc(d_linear1_weight, d_linear1_weight->numel() * sizeof(T)); - dev_ctx.Alloc(d_linear2_weight, d_linear2_weight->numel() * sizeof(T)); - - auto x_dim = x.dims(); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dim), 0, false); - - auto linear1_weight_dim = linear1_weight.dims(); - int d_model = linear1_weight_dim[0]; - int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; - int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; - - FFNGrad(context.cuda_device_context(), - d_out, - x, - dropout1_mask, - dropout2_mask, - linear1_out, - ln1_out, - dropout1_out, - dropout2_out, - linear1_weight, - linear1_bias, - linear2_weight, - ln1_scale, - ln1_bias, - ln1_mean, - ln1_variance, - ln2_scale, - 
ln2_bias, - ln2_mean, - ln2_variance, - d_x, - d_linear1_weight, - d_linear1_bias, - d_linear2_weight, - d_linear2_bias, - d_ln1_scale, - d_ln1_bias, - d_ln2_scale, - d_ln2_bias, - bsz_seq, - d_model, - dim_feedforward, - dropout_param1, - dropout_param2, - act_method, - pre_layer_norm, - epsilon1, - epsilon2, - add_residual, - ring_id); +PD_REGISTER_KERNEL(fused_feedforward_grad, + GPU, + ALL_LAYOUT, + phi::fusion::FusedFeedForwardGradKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL(fused_feedforward, - GPU, - ALL_LAYOUT, - ops::FusedFeedForwardKernel, - float, - double, - plat::float16) {} -PD_REGISTER_STRUCT_KERNEL(fused_feedforward_grad, - GPU, - ALL_LAYOUT, - ops::FusedFeedForwardGradKernel, - float, - double, - plat::float16) {} +} diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h index 3c7ae5ed09af3..ce67f7f167199 100644 --- a/paddle/phi/kernels/funcs/functors.h +++ b/paddle/phi/kernels/funcs/functors.h @@ -25,11 +25,6 @@ struct MulGradFunctor { inline HOSTDEVICE T Dy(T x, T y) { return x; } }; -template -struct MaxFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? b : a; } -}; - template struct AddGradFunctor { inline HOSTDEVICE T Dx(T x, T y) { return static_cast(1.); } diff --git a/paddle/phi/kernels/fused_feedforward_grad_kernel.h b/paddle/phi/kernels/fused_feedforward_grad_kernel.h index 9eee46a83987e..79b175d45ee89 100644 --- a/paddle/phi/kernels/fused_feedforward_grad_kernel.h +++ b/paddle/phi/kernels/fused_feedforward_grad_kernel.h @@ -24,13 +24,13 @@ void FusedFeedForwardGradKernel( const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& linear1_weight, - const DenseTensor& linear1_bias, + const paddle::optional& linear1_bias, const DenseTensor& linear2_weight, const DenseTensor& dropout1_mask, const DenseTensor& dropout2_mask, const DenseTensor& linear1_out, const DenseTensor& dropout1_out, - const DenseTensor& dropout2_out, + const paddle::optional& dropout2_out, const paddle::optional& ln1_scale, const paddle::optional& ln1_bias, const paddle::optional& ln1_out, diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h index 57cbf678b92b3..c73a35d2265ce 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -30,6 +30,8 @@ #include "paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h" #include "paddle/phi/kernels/layer_norm_kernel.h" +PHI_DECLARE_bool(use_fast_math); + namespace phi { namespace fusion { @@ -292,21 +294,22 @@ class FusedDropoutHelper { T* d_bias, const std::string& act_method) { if (act_method == "gelu") { - phi::funcs::GeluGradFunctor gelu_grad; - phi::fusion:: - LaunchDropoutActBiasGrad>( - gelu_grad, - dout, - mask, - src, - bias, - dropout_param_.dropout_prob, - dropout_param_.is_upscale_in_train, - rows_, - cols_, - d_src, - d_bias, - ctx); + phi::fusion::GeluGradFunctor gelu_grad; + phi::fusion::LaunchDropoutActBiasGrad>( + gelu_grad, + dout, + mask, + src, + bias, + 
dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, + rows_, + cols_, + d_src, + d_bias, + ctx); } else if (act_method == "relu") { phi::funcs::ReluGradFunctor relu_grad; phi::fusion:: diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc index 6798df360de19..3448efca7c3ab 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc @@ -366,13 +366,13 @@ void FusedFeedForwardGradKernel( const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& linear1_weight, - const DenseTensor& linear1_bias, + const paddle::optional& linear1_bias, const DenseTensor& linear2_weight, const DenseTensor& dropout1_mask, const DenseTensor& dropout2_mask, const DenseTensor& linear1_out, const DenseTensor& dropout1_out, - const DenseTensor& dropout2_out, + const paddle::optional& dropout2_out, const paddle::optional& ln1_scale, const paddle::optional& ln1_bias, const paddle::optional& ln1_out, @@ -417,7 +417,7 @@ void FusedFeedForwardGradKernel( auto* ln1_out_ptr = pre_layer_norm ? ln1_out.get_ptr() : nullptr; auto* dropout1_out_ptr = &dropout1_out; - auto* dropout2_out_ptr = &dropout2_out; + auto* dropout2_out_ptr = dropout2_out.get_ptr(); auto* linear1_weight_ptr = &linear1_weight; auto* linear2_weight_ptr = &linear2_weight; diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a383e1b0c0624..e7d11ed8b16d6 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1162,6 +1162,8 @@ set(STATIC_BUILD_TESTS test_fetch_lod_tensor_array test_fused_attention_op test_fused_attention_op_api + test_fused_feedforward_op + test_fused_feedforward_pass test_imperative_optimizer test_lamb_op test_layer_norm_op @@ -1191,6 +1193,8 @@ set(STATIC_BUILD_TESTS if(NOT WITH_GPU) list(REMOVE_ITEM STATIC_BUILD_TESTS test_fused_attention_op) list(REMOVE_ITEM STATIC_BUILD_TESTS test_fused_attention_op_api) + list(REMOVE_ITEM STATIC_BUILD_TESTS test_fused_feedforward_op) + list(REMOVE_ITEM STATIC_BUILD_TESTS test_fused_feedforward_op_pass) endif() foreach(STATIC_BUILD_TEST ${STATIC_BUILD_TESTS}) From 8bfd978f06ca78e6040df0027343dac44c65bcc3 Mon Sep 17 00:00:00 2001 From: NetPunk <69072522+Patrick-Star125@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:22:50 +0800 Subject: [PATCH 107/405] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=204?= =?UTF-8?q?=E3=80=91=EF=BC=9A=E4=B8=BAmaxout=E7=AE=97=E5=AD=90=E6=94=AF?= =?UTF-8?q?=E6=8C=81=20float16=20=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B=20(#?= =?UTF-8?q?50976)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * support fp16 for maxout op * format code * change api * add test for static float16 * format code * formatting code * atol alignment * experiment—1 * experiment-2 * experiment-3 * format code --- paddle/phi/kernels/funcs/maxouting.cu | 2 ++ paddle/phi/kernels/gpu/maxout_grad_kernel.cu | 9 +++-- paddle/phi/kernels/gpu/maxout_kernel.cu | 8 ++++- .../fluid/tests/unittests/test_maxout_op.py | 35 +++++++++++++++++++ python/paddle/nn/functional/activation.py | 6 ++-- 5 files changed, 55 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/funcs/maxouting.cu b/paddle/phi/kernels/funcs/maxouting.cu index 89450dbd5c60b..146bb1aca4c1b 100644 --- a/paddle/phi/kernels/funcs/maxouting.cu +++ 
b/paddle/phi/kernels/funcs/maxouting.cu @@ -175,9 +175,11 @@ void MaxOutGradFunctor::operator()( } template class MaxOutGradFunctor; +template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; +template class MaxOutFunctor; template class MaxOutFunctor; } // namespace funcs diff --git a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu index a405f38523a75..7d59436019c71 100644 --- a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu @@ -15,5 +15,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - maxout_grad, GPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} +PD_REGISTER_KERNEL(maxout_grad, + GPU, + ALL_LAYOUT, + phi::MaxOutGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/maxout_kernel.cu b/paddle/phi/kernels/gpu/maxout_kernel.cu index e5407a4925c84..4871046450264 100644 --- a/paddle/phi/kernels/gpu/maxout_kernel.cu +++ b/paddle/phi/kernels/gpu/maxout_kernel.cu @@ -15,4 +15,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/maxout_kernel_impl.h" -PD_REGISTER_KERNEL(maxout, GPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} +PD_REGISTER_KERNEL(maxout, + GPU, + ALL_LAYOUT, + phi::MaxOutKernel, + float, + phi::dtype::float16, + double) {} diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index 678dd55fe92c1..b6d339c3aab28 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -136,5 +136,40 @@ def test_errors(self): self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) +class TestMaxOutOpFP16(TestMaxOutOp): + def set_attrs(self): + self.dtype = 'float16' + + +class TestMaxoutFP16Case1(TestMaxOutOpFP16): + def set_attrs(self): + self.axis = -1 + + +class TestMaxoutFP16Case2(TestMaxOutOpFP16): + def set_attrs(self): + self.axis = 3 + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class TestMaxoutStaticAPIFP16(unittest.TestCase): + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [2, 6, 5, 4]).astype(np.float16) + self.groups = 2 + self.axis = 1 + self.place = paddle.CUDAPlace(0) + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', self.x_np.shape, self.x_np.dtype) + out = F.maxout(x, self.groups, self.axis) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 04fa9ebc6dd09..5bb9d2b1d0370 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -787,7 +787,7 @@ def maxout(x, groups, axis=1, name=None): Parameters: x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type - of input is float32 or float64. + of input is float16, float32 or float64. groups (int): The groups number of maxout. `groups` specifies the index of channel dimension where maxout will be performed. This must be a factor of number of features. 
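The registrations and tests above extend maxout to float16 on the GPU, covering both the forward and gradient kernels. A minimal dygraph sketch of the new path, with made-up shapes and values and assuming a CUDA device (the float16 kernels are registered for GPU only):

import numpy as np
import paddle
import paddle.nn.functional as F

paddle.set_device('gpu')   # float16 maxout is only registered for the GPU
x = paddle.to_tensor(
    np.random.uniform(-1, 1, [2, 6, 5, 4]).astype('float16'))
x.stop_gradient = False
out = F.maxout(x, groups=2, axis=1)   # 6 channels / 2 groups -> output shape [2, 3, 5, 4]
out.sum().backward()                  # exercises the new float16 MaxOutGradFunctor
print(out.dtype, x.grad.dtype)        # both paddle.float16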
@@ -822,7 +822,9 @@ def maxout(x, groups, axis=1, name=None): if in_dygraph_mode(): return _C_ops.maxout(x, groups, axis) else: - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'maxout' + ) if axis not in [1, -1, 3]: raise ValueError( "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received " From cf6cbc347970a1fd2c9d76e427880139789497af Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Thu, 27 Apr 2023 14:34:06 +0800 Subject: [PATCH 108/405] autogen code support for max_pool[2,3]_with_index op (#53359) --- paddle/fluid/operators/pool_with_index_op.cc | 302 ------------------- paddle/phi/api/yaml/backward.yaml | 18 ++ paddle/phi/api/yaml/legacy_backward.yaml | 18 -- paddle/phi/api/yaml/legacy_ops.yaml | 18 -- paddle/phi/api/yaml/op_compat.yaml | 16 + paddle/phi/api/yaml/ops.yaml | 18 ++ paddle/phi/ops/compat/pool_sig.cc | 46 --- 7 files changed, 52 insertions(+), 384 deletions(-) delete mode 100644 paddle/fluid/operators/pool_with_index_op.cc diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc deleted file mode 100644 index 79262db30fafb..0000000000000 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ /dev/null @@ -1,302 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -inline int MaxPoolOutputSize(int input_size, - int filter_size, - int padding, - int stride) { - PADDLE_ENFORCE_NE( - stride, - 0, - phi::errors::InvalidArgument( - "The stride of MaxPool shall not be 0, but received %d.", stride)); - int output_size = (input_size - filter_size + 2 * padding) / stride + 1; - return output_size; -} - -class MaxPoolWithIndexOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensor) The input tensor of pooling operator. 
" - "The format of input tensor is NCHW, where N is batch size, C is the " - "number of channels, H is the height of the image, " - "and W is the width of the image."); - AddOutput("Out", - "(Tensor) The output tensor of pooling operator. " - "The format of output tensor is also NCHW, " - "where N is batch size, C is " - "the number of channels, H is the height of the image " - "and W is the width of the image."); - AddOutput("Mask", - "(Tensor) The Mask tensor of pooling operator." - "The format of output tensor is also NCHW, " - "where N is batch size, C is the number of channels, " - "H is the height of the image, " - "and W is the width of the image. " - "It represents the index in the current feature map."); - - AddAttr>("ksize", - "(vector) The pooling window size(height, " - "width) of pooling operator. " - "If global_pooling = true, ksize and paddings " - "will be ignored."); // TODO(Chengduo): Add - // checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr( - "global_pooling", - "(bool, default:false) Whether to use the global pooling. " - "If global_pooling = true, ksize and paddings will be ignored.") - .SetDefault(false); - AddAttr( - "adaptive", - "(bool, default False) When true, will perform adaptive pooling " - "instead, " - "output shape in H and W dimensions will be same as ksize, input data " - "will be divided into grids specify by ksize averagely and perform " - "pooling in each grid area to get output pooling value.") - .SetDefault(false); - AddAttr>("strides", - "(vector, default {1, 1}), strides(height, " - "width) of pooling operator.") - .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr>( - "paddings", - "(vector, default:{0, 0}), paddings(height, width) of pooling " - "operator. " - "If global_pooling = true, paddings and will be ignored.") - .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - - AddComment(R"DOC( -MaxPool2d Operator. - -The maxPooling2d with index operation calculates the output and the mask -based on the input, ksize, strides, and paddings parameters. Input(X) and -output(Out, Mask) are in NCHW format, where N is batch size, C is the -number of channels, H is the height of the feature, -and W is the width of the feature. -Parameters(ksize, strides, paddings) are two elements. -These two elements represent height and width, respectively. -The input(X) size and output(Out, Mask) size may be different. - -Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - Mask shape: $(N, C, H_{out}, W_{out})$ - Where - $$ - H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 - $$ - - For adaptive = true: - $$ - H_{out} = ksize[0] W_{out} = ksize[1] - $$ - - -)DOC"); - } -}; - -class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor) The input tensor of pooling operator. " - "The format of input tensor is NCDHW, where N is batch size, C is " - "the number of channels, and D, H and W are the depth, height and " - "width of " - "the image, respectively"); - AddOutput("Out", - "(Tensor) The output tensor of pooling operator. 
" - "The format of output tensor is also NCDHW, " - "where N is the batch size, C is the number of channels, " - "and D, H and W are the depth, height and " - "width of the image, respectively."); - AddOutput("Mask", - "(Tensor) The Mask tensor of pooling operator. " - "The format of output tensor is also NCDHW, " - "where N is the batch size, C is the number of channels, and " - "D, H and W are the depth, height and width " - "of the image, respectively. " - "It represents the index in the current feature map."); - - AddAttr>("ksize", - "(vector) The pooling window size(depth, " - "height, width) of pooling operator. " - "If global_pooling = true, ksize and paddings " - "will be ignored."); // TODO(Chengduo): Add - // checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr( - "global_pooling", - "(bool, default false) Whether to use the global pooling. " - "If global_pooling = true, ksize and paddings will be ignored.") - .SetDefault(false); - AddAttr( - "adaptive", - "(bool, default False) When true, will perform adaptive pooling " - "instead, " - "output shape in H and W dimensions will be same as ksize, input data " - "will be divided into grids specify by ksize averagely and perform " - "pooling in each grid area to get output pooling value.") - .SetDefault(false); - AddAttr>("strides", - "(vector, default {1,1,1}), strides(depth, " - "height, width) of pooling operator.") - .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr>( - "paddings", - "(vector, default {0,0,0}), paddings(depth, " - "height, width) of pooling operator. " - "If global_pooling = true, paddings and ksize will be ignored.") - .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - - AddComment(R"DOC( -MaxPool3d Operator. - -The maxpooling3d with index operation calculates the output and the mask -based on the input and ksize, strides, paddings parameters. -Input(X) and output(Out, Mask) are in NCDHW format, where N is batch -size, C is the number of channels, and D, H and W are the depth, height and -width of the feature, respectively. -Parameters(ksize, strides, paddings) are three elements. -These three elements represent depth, height and width, respectively. -The input(X) size and output(Out, Mask) size may be different. 
- -Example: - Input: - X shape: $(N, C, D_{in}, H_{in}, W_{in})$ - Output: - Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ - Where - $$ - D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ - H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 - $$ - - For adaptive = true: - $$ - D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] - $$ - -)DOC"); - } -}; - -template -class MaxPoolWithIndexGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType(this->ForwardOpType() + "_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("X", this->Input("X")); - op->SetInput("Mask", this->Output("Mask")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER( - MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, - MaxPool2dWithIndexInferShapeFunctor, - PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, - MaxPool2dWithIndexGradInferShapeFunctor, - PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); - -REGISTER_OPERATOR(max_pool2d_with_index, - ops::MaxPoolWithIndexOp, - ops::MaxPool2dWithIndexOpMaker, - ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker, - MaxPool2dWithIndexInferShapeFunctor); -REGISTER_OPERATOR(max_pool2d_with_index_grad, - ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, - MaxPool2dWithIndexGradInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, - MaxPool3dWithIndexInferShapeFunctor, - PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, - MaxPool3dWithIndexGradInferShapeFunctor, - PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); - -REGISTER_OPERATOR(max_pool3d_with_index, - ops::MaxPoolWithIndexOp, - ops::MaxPool3dWithIndexOpMaker, - ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker, - MaxPool3dWithIndexInferShapeFunctor); -REGISTER_OPERATOR(max_pool3d_with_index_grad, - ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, - MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 6faf2d0ba7a49..9a31d8fb3e3b1 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1092,6 +1092,24 @@ kernel : func : matrix_power_grad +- backward_op : max_pool2d_with_index_grad + forward : max_pool2d_with_index(Tensor x, int[] kernel_size, int[] strides = {1, 1}, int[] paddings = {0, 0}, bool global_pooling = false, bool adaptive = false) -> Tensor(out), Tensor(mask) + args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) + output : Tensor(x_grad) + infer_meta : + func : MaxPoolWithIndexGradInferMeta + kernel : + func : max_pool2d_with_index_grad + +- backward_op : max_pool3d_with_index_grad + forward : max_pool3d_with_index(Tensor x, int[] kernel_size, int[] strides = {1, 1, 1}, int[] paddings = {0, 0, 0}, bool 
global_pooling = false, bool adaptive = false) -> Tensor(out), Tensor(mask) + args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) + output : Tensor(x_grad) + infer_meta : + func : MaxPoolWithIndexGradInferMeta + kernel : + func : max_pool3d_with_index_grad + - backward_op : maxout_grad forward : maxout(Tensor x, int groups, int axis) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index c80e79e3ff207..590acb5b35b93 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -557,24 +557,6 @@ func : max_grad composite : max_grad(x, out, out_grad, axis, keepdim, reduce_all, x_grad) -- backward_op : max_pool2d_with_index_grad - forward : max_pool2d_with_index(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) -> Tensor(out), Tensor(mask) - args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) - output : Tensor(x_grad) - infer_meta : - func : MaxPoolWithIndexGradInferMeta - kernel : - func : max_pool2d_with_index_grad - -- backward_op : max_pool3d_with_index_grad - forward : max_pool3d_with_index(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) -> Tensor(out), Tensor(mask) - args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) - output : Tensor(x_grad) - infer_meta : - func : MaxPoolWithIndexGradInferMeta - kernel : - func : max_pool3d_with_index_grad - - backward_op : maximum_grad forward : maximum(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 5bb3e8d0d73e7..257d67c3477ee 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -732,24 +732,6 @@ func : max backward : max_grad -- op : max_pool2d_with_index - args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) - output : Tensor(out), Tensor(mask) - infer_meta : - func : MaxPoolWithIndexInferMeta - kernel : - func : max_pool2d_with_index - backward : max_pool2d_with_index_grad - -- op : max_pool3d_with_index - args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) - output : Tensor(out), Tensor(mask) - infer_meta : - func : MaxPoolWithIndexInferMeta - kernel : - func : max_pool3d_with_index - backward : max_pool3d_with_index_grad - - op : maximum args : (Tensor x, Tensor y) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 68778a1c85602..17e56bd2a0e90 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1469,6 +1469,22 @@ extra : attrs : [bool use_mkldnn = false] +- op : max_pool2d_with_index + inputs : + {x : X} + outputs : + {out : Out, mask : Mask} + attrs : + kernel_size : ksize + +- op : max_pool3d_with_index + inputs : + {x : X} + outputs : + {out : Out, mask : Mask} + attrs : + kernel_size : ksize + - op : maximum (elementwise_max) backward : maximum_grad (elementwise_max_grad) extra : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 
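With the hand-written C++ operator gone, max_pool2d_with_index and max_pool3d_with_index are described entirely by the YAML entries in this commit (defaults included) and the generated code keeps the Python-facing behaviour unchanged. A rough usage sketch, assuming the functional entry point paddle.nn.functional.max_pool2d with return_mask=True still lowers to max_pool2d_with_index (shapes are illustrative):

import paddle
import paddle.nn.functional as F

x = paddle.rand([1, 3, 32, 32])
out, mask = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True)
print(out.shape, mask.shape)   # both [1, 3, 16, 16]; mask holds the argmax positions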
8fc8c4c9b081c..8d99f74156736 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1271,6 +1271,24 @@ func : matrix_power backward : matrix_power_grad +- op : max_pool2d_with_index + args : (Tensor x, int[] kernel_size, int[] strides= {1, 1}, int[] paddings = {0, 0}, bool global_pooling = false, bool adaptive = false) + output : Tensor(out), Tensor(mask) + infer_meta : + func : MaxPoolWithIndexInferMeta + kernel : + func : max_pool2d_with_index + backward : max_pool2d_with_index_grad + +- op : max_pool3d_with_index + args : (Tensor x, int[] kernel_size, int[] strides = {1, 1, 1}, int[] paddings = {0, 0, 0}, bool global_pooling = false, bool adaptive = false) + output : Tensor(out), Tensor(mask) + infer_meta : + func : MaxPoolWithIndexInferMeta + kernel : + func : max_pool3d_with_index + backward : max_pool3d_with_index_grad + - op : maxout args : (Tensor x, int groups, int axis = 1) output : Tensor(out) diff --git a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc index b807b21a1c0b1..8bbb1d63c29ad 100644 --- a/paddle/phi/ops/compat/pool_sig.cc +++ b/paddle/phi/ops/compat/pool_sig.cc @@ -65,24 +65,6 @@ KernelSignature Pool2dDoubleGradOpArgumentMapping( {"Out"}); } -KernelSignature MaxPool2dWithIndexOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "max_pool2d_with_index", - {"X"}, - {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, - {"Out", "Mask"}); -} - -KernelSignature MaxPool2dWithIndexGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "max_pool2d_with_index_grad", - {"X", "Mask", "Out@GRAD"}, - {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, - {"X@GRAD"}); -} - KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("pool3d", {"X"}, @@ -115,24 +97,6 @@ KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { {"X@GRAD"}); } -KernelSignature MaxPool3dWithIndexOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "max_pool3d_with_index", - {"X"}, - {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, - {"Out", "Mask"}); -} - -KernelSignature MaxPool3dWithIndexGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "max_pool3d_with_index_grad", - {"X", "Mask", "Out@GRAD"}, - {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, - {"X@GRAD"}); -} - } // namespace phi PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); @@ -140,15 +104,5 @@ PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, phi::Pool2dDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, - phi::MaxPool2dWithIndexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, - phi::MaxPool2dWithIndexGradOpArgumentMapping); - PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); - -PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, - phi::MaxPool3dWithIndexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, - phi::MaxPool3dWithIndexGradOpArgumentMapping); From db30aa1d7fac77b98105e40878d0746327fea86a Mon Sep 17 00:00:00 2001 From: yangguohao <70266361+yangguohao@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:35:01 +0800 Subject: [PATCH 109/405] 
=?UTF-8?q?=E3=80=90Hackathon=20No.91=E3=80=91regi?= =?UTF-8?q?ster=5Fhook=20for=20static=20mode=20=20(#52948)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/fluid/framework.py | 21 ++- .../unittests/test_tensor_register_hook.py | 22 ++- python/paddle/jit/dy2static/ast_utils.py | 84 +++++++++ python/paddle/jit/dy2static/utils.py | 25 +-- test/dygraph_to_static/test_tensor_hook.py | 169 ++++++++++++++++++ 5 files changed, 302 insertions(+), 19 deletions(-) create mode 100644 test/dygraph_to_static/test_tensor_hook.py diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index cab57eae045b3..7c183ea34767e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1640,9 +1640,26 @@ def clear_gradient(self): """ pass - @fake_interface_only def register_hook(self, hook): - pass + import paddle + + def backward_hook_wrapper(dy): + """call the backward hook in .""" + import numpy as np + + return hook(np.array(dy)) + + def forward_hook_wrapper(x): + """do nothing but return a new variable.""" + return x + + paddle.static.py_func( + func=forward_hook_wrapper, + x=self, + out=self, + backward_func=backward_hook_wrapper, + skip_vars_in_backward_input=[self], + ) def __str__(self): return self._to_readable_code() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 16b6d32ce404d..d0fa1fe25974b 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -45,9 +45,10 @@ def __init__(self, in_size, out_size): self.linear1 = nn.Linear(in_size, in_size) self.linear2 = nn.Linear(in_size, out_size) - def forward(self, x): + def forward(self, x, hook=False): ret1 = self.linear1(x) - ret1.register_hook(lambda grad: grad * 2) + if hook: + ret1.register_hook(lambda grad: grad * 2) ret2 = self.linear2(ret1) out = paddle.mean(ret2, axis=-1) @@ -512,8 +513,7 @@ def test_register_hook_in_static_mode(self): ) net = SimpleNetForStatic(self.in_size, self.out_size) - with self.assertRaises(AssertionError): - out = net(x) + out = net(x) paddle.disable_static() @@ -527,9 +527,17 @@ def test_register_hook_in_dy2static_mode(self): 'float32' ) data_t = paddle.to_tensor(data) - - with self.assertRaises(AssertionError): - out = jit_net(data_t) + data_t2 = paddle.to_tensor(data) + data_t.stop_gradient = False + data_t2.stop_gradient = False + + out1 = jit_net(data_t) + out2 = jit_net(data_t2, True) + out1.backward() + out2.backward() + np.testing.assert_array_equal( + 2 * data_t.grad.numpy(), data_t2.grad.numpy() + ) HOOK_INIT_VALUE = 10 diff --git a/python/paddle/jit/dy2static/ast_utils.py b/python/paddle/jit/dy2static/ast_utils.py index 7724cc1b4a13f..a9f40b7ce6548 100644 --- a/python/paddle/jit/dy2static/ast_utils.py +++ b/python/paddle/jit/dy2static/ast_utils.py @@ -14,6 +14,9 @@ import ast +import collections +import inspect +import textwrap import astor @@ -38,3 +41,84 @@ def pretty_source(source): source_code = astor.to_source(ast_node, pretty_source=pretty_source) return source_code + + +class RegisterHookVisitor(gast.NodeVisitor): + def __init__(self, func_name): + self.register_hook_pos_map = collections.defaultdict(list) + self.assignment_pos_map = collections.defaultdict(list) + self.func_name = func_name + + def visit_FunctionDef(self, func_def): + # The inner function that has register_hook will not be 
processed + if func_def.name != self.func_name: + return + register_hook_pos_map = self.register_hook_pos_map + assignment_pos_map = self.assignment_pos_map + + for i in range(len(func_def.body) - 1, -1, -1): + + body = func_def.body[i] + # Check if the code body contains the register_hook + if isinstance(body, ast.Expr): + for node in ast.walk(body): + if ( + isinstance(node, ast.Attribute) + and node.attr == 'register_hook' + ): + # parameter name for register_hook + param_name = node.value.id + register_hook_pos_map[param_name].append(i) + elif isinstance(body, ast.Assign): + for target in body.targets: + assignment_pos_map[target.id].append(i) + + # Confirm the order + order_map = {} + for k, idx_list in register_hook_pos_map.items(): + for idx in idx_list: + if k not in assignment_pos_map: + order_map[idx] = 1 + else: + for assignment_idx in assignment_pos_map[k]: + if idx > assignment_idx: + order_map[idx] = assignment_idx + 1 + break + code_order = [*range(len(func_def.body))] + for k, v in sorted(order_map.items(), key=lambda x: x[1], reverse=True): + if k == v: + continue + code_order.remove(k) + code_order.insert(v, k) + + # rearrange the code according to the specified order + new_body = [func_def.body[i] for i in code_order] + func_def.body = new_body + + +def modify_function_code(func): + """ + Modify the function code for the register hook + """ + + func_ast = ast.parse(textwrap.dedent(inspect.getsource(func))) + # check if there is register_hook on code after visit the tree. + check_register_hook = next( + ( + node + for node in ast.walk(func_ast) + if isinstance(node, ast.Attribute) and node.attr == 'register_hook' + ), + None, + ) + if check_register_hook is None: + return + + visitor = RegisterHookVisitor(func.__name__) + visitor.visit(func_ast) + + def pretty_source(source): + return ''.join(source) + + new_code = astor.to_source(func_ast, pretty_source=pretty_source) + return new_code diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index bd90f6089fe95..df892ab2a06ef 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -38,7 +38,7 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle.utils import gast -from .ast_utils import ast_to_source_code +from .ast_utils import ast_to_source_code, modify_function_code from .static_analysis import StaticAnalysisVisitor from .utils_helper import DYGRAPH_MODULE_PREFIX # noqa: F401 from .utils_helper import DYGRAPH_TO_STATIC_MODULE_PREFIX # noqa: F401 @@ -643,15 +643,20 @@ def func_to_source_code(function, dedent=True): type(function).__name__ ) ) - source_code_list, _ = inspect.getsourcelines(function) - # Replace comments with blank lines so that error messages are not misplaced - source_code_list = [ - line if not line.lstrip().startswith('#') else '\n' - for line in source_code_list - ] - source_code = ''.join(source_code_list) - if dedent: - source_code = textwrap.dedent(source_code) + # return modified function source code if there is 'register_hook', otherwise return None + source_code = modify_function_code(function) + + if source_code is None: + source_code_list, _ = inspect.getsourcelines(function) + # Replace comments with blank lines so that error messages are not misplaced + source_code_list = [ + line if not line.lstrip().startswith('#') else '\n' + for line in source_code_list + ] + source_code = ''.join(source_code_list) + + if dedent: + source_code = textwrap.dedent(source_code) return source_code diff 
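Taken together, the framework.py and AST changes above make Tensor.register_hook usable outside dygraph: in pure static graphs the hook is routed through paddle.static.py_func, and under to_static the source rewrite moves each register_hook call after the assignment that produces its target. A minimal static-graph sketch of the new behaviour (the variable names and the doubling hook are illustrative, not taken from the tests):

import paddle

paddle.enable_static()
main = paddle.static.Program()
with paddle.static.program_guard(main):
    x = paddle.static.data('x', [4], 'float32')
    x.stop_gradient = False
    y = paddle.sum(x * 3.0)
    x.register_hook(lambda g: g * 2)   # previously raised in static mode; now inserts a py_func node
    paddle.static.append_backward(y)   # x@GRAD passes through the hook during backward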
--git a/test/dygraph_to_static/test_tensor_hook.py b/test/dygraph_to_static/test_tensor_hook.py new file mode 100644 index 0000000000000..a943c40495de4 --- /dev/null +++ b/test/dygraph_to_static/test_tensor_hook.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle import nn +from paddle.jit import to_static + + +class TestStaticAnalysis(unittest.TestCase): + def test_hook_for_different_parameter(self): + def f(x): + def h(g): + return 2 * g + + y = x + 4 + f = y + x + z = f**2 + y.register_hook(h) + f.register_hook(h) + x.register_hook(h) + return z + + x = paddle.to_tensor([2.0]) + x.stop_gradient = False + loss = f(x) + loss.backward() + + x_jit = paddle.to_tensor([2.0]) + x_jit.stop_gradient = False + jit_f = to_static(f) + loss = jit_f(x_jit) + loss.backward() + self.assertTrue(np.allclose(x.grad.numpy(), x_jit.grad.numpy())) + + def test_hook_for_reassignment_parameter(self): + def f(x): + def h(g): + return 2 * g + + y = x + 4 + x = y * 5 + z = x**2 + x.register_hook(h) + return z + + x = paddle.to_tensor([2.0]) + x.stop_gradient = False + loss = f(x) + loss.backward() + + x_jit = paddle.to_tensor([2.0]) + x_jit.stop_gradient = False + jit_f = to_static(f) + loss = jit_f(x_jit) + loss.backward() + self.assertTrue(np.allclose(x.grad.numpy(), x_jit.grad.numpy())) + + def test_hook_for_repeat_register(self): + def f(x): + def h(g): + return 2 * g + + y = x + 4 + z = y**2 + x.register_hook(h) + x.register_hook(h) + return z + + x = paddle.to_tensor([2.0]) + x.stop_gradient = False + loss = f(x) + loss.backward() + + x_jit = paddle.to_tensor([2.0]) + x_jit.stop_gradient = False + jit_f = to_static(f) + loss = jit_f(x_jit) + loss.backward() + self.assertTrue(np.allclose(x.grad.numpy(), x_jit.grad.numpy())) + + def test_hook_in_init_for_layer(self): + def hook(grad): + return grad * 2 + + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + class LinearNet(nn.Layer): + def __init__(self): + super().__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + # register_hook in init + self._linear.parameters()[0].register_hook(hook) + + def forward(self, x): + return self._linear(x) + + # create network + layer = LinearNet() + jit_layer = to_static(LinearNet()) + data = np.random.random([IMAGE_SIZE]).astype('float32') + image = paddle.to_tensor(data) + image_jit = paddle.to_tensor(data) + loss = layer(image) + loss_jit = jit_layer(image_jit) + loss_jit.backward() + loss.backward() + self.assertTrue( + np.allclose( + layer.parameters()[0].grad.numpy(), + jit_layer.parameters()[0].grad.numpy(), + ) + ) + + # def test_hook_in_forward_for_layer(self): + # + # IMAGE_SIZE = 784 + # CLASS_NUM = 10 + # + # class LinearNet(nn.Layer): + # def __init__(self): + # super().__init__() + # self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + # + # def forward(self, x): + # def hook(grad): + # return grad * 2 + # + # res = self._linear(x) + # + # # register_hook in forward + # 
self._linear.parameters()[0].register_hook(hook) + # return res + # + # # create network + # layer = LinearNet() + # jit_layer = to_static(LinearNet()) + # data = np.random.random([IMAGE_SIZE]).astype('float32') + # image = paddle.to_tensor(data) + # image_jit = paddle.to_tensor(data) + # loss = layer(image) + # loss_jit = jit_layer(image_jit) + # loss_jit.backward() + # loss.backward() + # self.assertTrue( + # np.allclose( + # layer.parameters()[0].grad.numpy(), + # jit_layer.parameters()[0].grad.numpy(), + # ) + # ) + + +if __name__ == '__main__': + unittest.main() From 9ab14865b7d3d343d0c34a26fd1893747cbb5bc0 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 27 Apr 2023 15:42:12 +0800 Subject: [PATCH 110/405] [CINN Support 0D-Tensor] CINN supports 0D-Tensor with trick temporarily (#53382) * [CINN Support 0D-Tensor] CINN supports 0D-Tensor with trick temporarily * Add unittest --- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../fluid/framework/details/build_strategy.cc | 4 + .../framework/paddle2cinn/CMakeLists.txt | 16 ++++ .../cinn_zero_tensor_trick_pass.cc | 78 +++++++++++++++++++ .../paddle2cinn/cinn_zero_tensor_trick_pass.h | 33 ++++++++ .../cinn_zero_tensor_trick_pass_test.cc | 56 +++++++++++++ 6 files changed, 188 insertions(+) create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass_test.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 820846cacca6b..99ebd6a370b4a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -382,6 +382,7 @@ set(IR_PASS_DEPS if(WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) + set(IR_PASS_DEPS ${IR_PASS_DEPS} cinn_zero_tensor_trick_pass) endif() if(NOT APPLE diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 91024a9dbe317..b0349966bb5d7 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -57,6 +57,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #ifdef PADDLE_WITH_CINN if (FLAGS_use_cinn || strategy.build_cinn_pass_) { + // Note: This is a trick to support 0D-Tensor for CINN. This pass will be + // removed in the near future. + AppendPass("cinn_zero_tensor_trick_pass"); // Note: This pass is used to enable cinn. 
AppendPass("build_cinn_pass"); AppendPrintGraphPass("graph_viz_pass", "_build_cinn_graph"); @@ -532,6 +535,7 @@ USE_PASS(fused_attention_pass); USE_PASS(fuse_adamw_op_pass); #endif #ifdef PADDLE_WITH_CINN +USE_PASS(cinn_zero_tensor_trick_pass); USE_PASS(build_cinn_pass); #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index a082dff6e54c2..f6a183304075b 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -8,6 +8,8 @@ pass_library( errors enforce) +pass_library(cinn_zero_tensor_trick_pass base) + cc_library( transform_desc SRCS transform_desc.cc @@ -62,6 +64,20 @@ if(WITH_TESTING) set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES}) + cc_test_old( + cinn_zero_tensor_trick_pass_test + SRCS + cinn_zero_tensor_trick_pass_test.cc + DEPS + build_cinn_pass + cinn_compiler + op_registry + elementwise_add_op + generated_op) + set_tests_properties(cinn_zero_tensor_trick_pass_test + PROPERTIES LABELS "RUN_TYPE=CINN") + target_link_libraries(cinn_zero_tensor_trick_pass_test ${PYTHON_LIBRARIES}) + cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") diff --git a/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.cc b/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.cc new file mode 100644 index 0000000000000..9c4e6192be424 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.h" + +#include +#include "glog/logging.h" + +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +void CinnZeroTensorTrickPass::ApplyImpl(ir::Graph* graph) const { + // fix shape attr of these ops + const std::unordered_set op_cases_fix_attr{"fill_constant", + "uniform_random", + "expand_v2", + "assign_value", + "gaussian_random", + "set_value"}; + for (const ir::Node* n : graph->Nodes()) { + if (n->IsOp() && op_cases_fix_attr.count(n->Op()->Type())) { + if (n->Op()->HasAttr("shape")) { + auto attr_type = n->Op()->GetAttrType("shape"); + if (attr_type == paddle::framework::proto::INTS) { + auto shapes = + PADDLE_GET_CONST(std::vector, n->Op()->GetAttr("shape")); + if (shapes.empty()) { + shapes.push_back(1); + n->Op()->SetAttr("shape", shapes); + VLOG(4) << "op " << n->Op()->Type() + << " shape attribute dims is empty, fix dim -> {1} "; + } + } else { /* attr_type == paddle::framework::proto::LONGS */ + auto shapes = + PADDLE_GET_CONST(std::vector, n->Op()->GetAttr("shape")); + if (shapes.empty()) { + shapes.push_back(1); + n->Op()->SetAttr("shape", shapes); + VLOG(4) << "op " << n->Op()->Type() + << " shape attribute dims is empty, fix dim -> {1} "; + } + } + } + } + if (n->IsVar()) { + if (n->Var() && n->Var()->GetType() == proto::VarType::LOD_TENSOR) { + std::vector shape = n->Var()->GetShape(); + if (shape.empty()) { + shape.push_back(1); + n->Var()->SetShape(shape); + VLOG(4) << "var " << n->Name() << " dims is empty, fix dim -> {1} "; + } + } + } + } +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cinn_zero_tensor_trick_pass, + paddle::framework::paddle2cinn::CinnZeroTensorTrickPass); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.h b/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.h new file mode 100644 index 0000000000000..57e8beb6dacf8 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +class Graph; + +class CinnZeroTensorTrickPass : public framework::ir::Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass_test.cc new file mode 100644 index 0000000000000..ff07ce2f3de50 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass_test.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. 
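The pass itself is small: before build_cinn_pass runs, every empty dims vector — whether on a LOD_TENSOR variable or in the shape attribute of ops such as fill_constant, uniform_random and expand_v2 — is rewritten to {1}, so CINN never sees a true 0-D tensor. The core of the trick, restated in Python purely for illustration (the real pass works on the ir::Graph in C++):

def fix_dims(dims):
    # a 0-D tensor has an empty dims vector; present it to CINN as shape [1]
    return list(dims) if len(dims) else [1]

print(fix_dims([]))       # [1]
print(fix_dims([3, 4]))   # [3, 4] -- non-empty shapes are left untouched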
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/paddle2cinn/cinn_zero_tensor_trick_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +TEST(CinnZeroTensorTrickPass, basic) { + ir::Layers layers; + auto* x = layers.data("x", {}); + auto* y = layers.data("y", {3, 4}); + auto* add_out_0 = layers.elementwise_add(x, y, nullptr, 0); + std::unique_ptr graph(new ir::Graph(layers.main_program())); + auto pass = ir::PassRegistry::Instance().Get("cinn_zero_tensor_trick_pass"); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + VLOG(3) << DebugString(graph); + + for (auto* n : graph->Nodes()) { + if (n->IsVar()) { + if (n->Var() && n->Var()->GetType() == proto::VarType::LOD_TENSOR) { + std::vector shape = n->Var()->GetShape(); + PADDLE_ENFORCE_EQ( + shape.empty(), + false, + platform::errors::PreconditionNotMet( + "The shape of elementwise_add should not be empty after fuse")); + } + } + } +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +USE_PASS(cinn_zero_tensor_trick_pass); From 18968e7ea3d40229e26355d30e529b39919bc75c Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:42:33 +0800 Subject: [PATCH 111/405] [static op generation] triangular_solve (#53328) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [static op generation] triangular_solve * [phi] mv triangular_solve_grad to static_backward * [phi] fix import * [phi] mv to ops.yaml、 backward.yaml * fix forward attr * [phi] fix triangular_solve_grad args --- paddle/fluid/operators/triangular_solve_op.cc | 135 ------------------ paddle/phi/api/yaml/backward.yaml | 10 ++ paddle/phi/api/yaml/legacy_backward.yaml | 10 -- paddle/phi/api/yaml/legacy_ops.yaml | 10 -- paddle/phi/api/yaml/op_compat.yaml | 7 + paddle/phi/api/yaml/ops.yaml | 10 ++ paddle/phi/ops/compat/triangular_solve_sig.cc | 30 ---- 7 files changed, 27 insertions(+), 185 deletions(-) delete mode 100644 paddle/fluid/operators/triangular_solve_op.cc delete mode 100644 paddle/phi/ops/compat/triangular_solve_sig.cc diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc deleted file mode 100644 index 66e4c3a57890e..0000000000000 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class TriangularSolveOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class TriangularSolveOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor), The first input tensor of triangular solve op, which " - "is the triangular coefficient matrix."); - AddInput("Y", - "(Tensor), The second input tensor of triangular solve op, which " - "is multiple right-hand."); - AddOutput("Out", "(Tensor), The solution tensor of triangular solve op."); - AddAttr("upper", - "whether to solve the upper-triangular or the " - "lower-triangular system of equations") - .SetDefault(true); - AddAttr("transpose", "whether X should be transposed firstly.") - .SetDefault(false); - AddAttr("unitriangular", "whether X is unit triangular.") - .SetDefault(false); - AddComment(R"DOC( - Triangular Solve Operator. - This operator is used to computes the solution of equations with a triangular coefficient matrix. - - The equation is: - $$Out = X^-1 * Y$$ -)DOC"); - } -}; - -class TriangularSolveOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Out"}}; - return m; - } -}; - -class TriangularSolveGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "triangular_solve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "triangular_solve"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "triangular_solve"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "triangular_solve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); - } - } -}; - -template -class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr retv) const override { - retv->SetType("triangular_solve_grad"); - retv->SetInput("X", this->Input("X")); - retv->SetInput("Y", this->Input("Y")); - retv->SetInput("Out", this->Output("Out")); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - retv->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, - TriangularSolveInferShapeFunctor, - 
PD_INFER_META(phi::TriangularSolveInferMeta)); - -REGISTER_OPERATOR(triangular_solve, - ops::TriangularSolveOp, - ops::TriangularSolveOpMaker, - ops::TriangularSolveOpInferVarType, - ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker, - TriangularSolveInferShapeFunctor); - -REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 9a31d8fb3e3b1..a2189b7084bd2 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1950,6 +1950,16 @@ data_type : out_grad no_need_buffer : x +- backward_op : triangular_solve_grad + forward : triangular_solve (Tensor x, Tensor y, bool upper=true, bool transpose=false, bool unitriangular=false) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool transpose, bool unitriangular) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : triangular_solve_grad + - backward_op : trilinear_interp_grad forward : trilinear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 590acb5b35b93..30bfa1f384758 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -1034,16 +1034,6 @@ backward : transpose_double_grad composite: transpose_grad(out_grad, perm, x_grad) -- backward_op : triangular_solve_grad - forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool tranpose, bool unitriangular) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : triangular_solve_grad - - backward_op : tril_grad forward : tril(Tensor x, int diagonal) -> Tensor(out) args : (Tensor out_grad, int diagonal) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 257d67c3477ee..c3610b4ae5ae1 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1140,16 +1140,6 @@ func : transpose backward : transpose_grad -- op : triangular_solve - args : (Tensor x, Tensor y, bool upper, bool transpose, bool unitriangular) - output : Tensor - infer_meta : - func : TriangularSolveInferMeta - kernel : - func : triangular_solve - data_type : x - backward : triangular_solve_grad - - op : tril args : (Tensor x, int diagonal) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 17e56bd2a0e90..46245aa4c928e 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2334,6 +2334,13 @@ outputs : [XShape] attrs : [bool use_mkldnn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] +- op : triangular_solve + backward : triangular_solve_grad + inputs : + {x : X, y : Y} + outputs : + out : Out + - op : trilinear_interp 
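After this migration triangular_solve is described entirely by the YAML files, with the defaults upper=true, transpose=false and unitriangular=false carried in the YAML signature rather than the removed C++ maker; the solved equation is still Out = X^-1 * Y. A quick sketch of the Python API this corresponds to, assuming paddle.linalg.triangular_solve keeps the same argument order (the values are illustrative):

import paddle

# solve x @ out = y for out, where x is upper triangular
x = paddle.to_tensor([[1.0, 1.0, 1.0],
                      [0.0, 2.0, 1.0],
                      [0.0, 0.0, -1.0]])
y = paddle.to_tensor([[0.0], [-9.0], [5.0]])
out = paddle.linalg.triangular_solve(x, y, upper=True)
print(out)   # [[7.], [-2.], [-5.]]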
(trilinear_interp_v2) backward : trilinear_interp_grad (trilinear_interp_v2_grad) inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 8d99f74156736..bea41007e5aec 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2062,6 +2062,16 @@ func : trace backward : trace_grad +- op : triangular_solve + args : (Tensor x, Tensor y, bool upper=true, bool transpose=false, bool unitriangular=false) + output : Tensor + infer_meta : + func : TriangularSolveInferMeta + kernel : + func : triangular_solve + data_type : x + backward : triangular_solve_grad + - op : trilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) output : Tensor(output) diff --git a/paddle/phi/ops/compat/triangular_solve_sig.cc b/paddle/phi/ops/compat/triangular_solve_sig.cc deleted file mode 100644 index 851db32a032d6..0000000000000 --- a/paddle/phi/ops/compat/triangular_solve_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature TriangularSolveGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("triangular_solve_grad", - {"X", "Y", "Out", "Out@GRAD"}, - {"upper", "transpose", "unitriangular"}, - {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(triangular_solve_grad, - phi::TriangularSolveGradOpArgumentMapping); From 9c1eb98ad6c979ec9f9b1d6e170843b858c60272 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:07:36 +0800 Subject: [PATCH 112/405] [XPU] c_sync_calc_stream support more types (#53389) --- .../operators/collective/c_sync_calc_stream_op_xpu.cc | 11 +++++++++-- paddle/phi/backends/xpu/xpu2_op_list.cc | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc index 0b432cab281fc..24157f1c64a6c 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc @@ -17,5 +17,12 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL( - c_sync_calc_stream, XPU, ALL_LAYOUT, ops::CSyncCalcStreamKernel, float) {} +PD_REGISTER_STRUCT_KERNEL(c_sync_calc_stream, + XPU, + ALL_LAYOUT, + ops::CSyncCalcStreamKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 43461e696da81..303963450b197 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -115,7 +115,12 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32, phi::DataType::INT32})}, - {"c_sync_calc_stream", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_sync_calc_stream", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, {"c_sync_comm_stream", XPUKernelSet({phi::DataType::FLOAT32})}, {"cast", XPUKernelSet({phi::DataType::FLOAT32, From 421f56a86d21ac3503d9a750ccd122ec72a0da84 Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:13:46 +0800 Subject: [PATCH 113/405] set sync_param default true (#53335) --- paddle/fluid/framework/distributed_strategy.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 6b093e9ee03b8..0435199ecaa48 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -52,7 +52,7 @@ message ShardingConfig { // for dygraph message MpConfig { - optional bool sync_param= 1 [ default = false ]; + optional bool sync_param= 1 [ default = true ]; optional bool sync_grad= 2 [ default = false ]; optional bool sync_moment= 3 [ default = false ]; optional string sync_mode= 4 [ default = 'broadcast' ]; From a3a9168268c530ce04c63bda83d41e5257cab3de Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:48:29 +0800 Subject: [PATCH 114/405] Update inference approve list (#53399) * Update slim approve list * Fix id, test=document_fix --- tools/check_api_approvals.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index ed77494c43f35..36449fd1444a5 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -104,8 +104,8 @@ if [ "$slim_approve" != "" ]; then fi if [ "$inference_approve" != "" ]; then - echo_line="You must have one RD (Superjomn(Recommend), Shixiaowei02, cyj1986) approval for the changes of `def` Inputs/Output/Attrs of OPs. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${inference_approve}\n" - check_approval 1 39645414 328693 39303645 + echo_line="You must have one RD (qingqing01(Recommend), jiweibo, Shixiaowei02) approval for the changes of `def` Inputs/Output/Attrs of OPs. 
\n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${inference_approve}\n" + check_approval 1 7845005 26377421 39303645 fi DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec From 2d17df97015e5635ffb344d081234372b724656a Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Thu, 27 Apr 2023 17:17:04 +0800 Subject: [PATCH 115/405] [Dy2St]Get grad names when call append backward to fix high order gradient (#53250) [Dy2St]Get grad names when call append backward to fix high order gradient (#53250) --- .../eager/to_static/run_program_op_node.h | 19 +++- paddle/fluid/operators/run_program_op.cc | 4 + python/paddle/fluid/backward.py | 69 ++++++++----- .../fluid/tests/unittests/test_dropout_op.py | 9 +- .../tests/unittests/test_eager_run_program.py | 2 + .../tests/unittests/test_run_program_op.py | 4 + .../paddle/jit/dy2static/partial_program.py | 58 +++++++---- python/paddle/jit/dy2static/utils.py | 73 ++++---------- python/paddle/jit/translated_layer.py | 39 +++++++- test/dygraph_to_static/test_gradname_parse.py | 96 +++++++++++++++++++ 10 files changed, 262 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 57defbaee4dca..b4deb4e4ac306 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -689,15 +689,26 @@ class GradNodeRunProgram : public egr::GradNodeBase { protected: void ConstructXGradTensors(const std::vector &x, std::vector *x_grad) { + auto x_grad_names = + PADDLE_GET_CONST(std::vector, attrs_.at("x_grad_names")); + PADDLE_ENFORCE_EQ( + x.size(), + x_grad_names.size(), + paddle::platform::errors::InvalidArgument( + "The x.size() and x_grad_names.size() should be equal. " + "But received x.size() = %d, x_grad_names.size() = %d", + x.size(), + x_grad_names.size())); + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, // such as: name, tensor type(DenseTensor or SelectedRows). - for (auto &t : x) { - if (t.is_dense_tensor()) { + for (size_t i = 0; i < x.size(); i++) { + if (x[i].is_dense_tensor()) { x_grad->emplace_back(std::make_shared()); - } else if (t.is_selected_rows()) { + } else if (x[i].is_selected_rows()) { x_grad->emplace_back(std::make_shared()); } - x_grad->back().set_name(t.name() + "@GRAD"); + x_grad->back().set_name(x_grad_names[i]); } } diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index d54a9e9cbe0a6..353fca80ad404 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -139,6 +139,10 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "std::vector" "The names of output gradients.") .SetDefault({}); + AddAttr>("x_grad_names", + "std::vector" + "The names of input gradients.") + .SetDefault({}); AddComment(R"DOC( RunProgram operator. diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index ef66532e6b730..fa7ae57ff9d97 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -2376,28 +2376,12 @@ def _find_op_path_( return op_path -def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): - """ - Backpropagate the gradients of targets to inputs. 
- - Args: - targets(Tensor|list[Tensor]|tuple[Tensor]): The target Tensors - inputs(Tensor|list[Tensor]|tuple[Tensor]): The input Tensors - target_gradients (Tensor|list[Tensor]|tuple[Tensor], optional): The gradient Tensors - of targets which has the same shape with targets, If None, ones will - be created for them. - no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients - should be ignored. All Tensors with - `stop_gradient=True` from all blocks will - be automatically added into this set. - If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set. - Default: None. - - Return: - (list[Tensor]): A list of gradients for inputs - If an input does not affect targets, the corresponding gradient Tensor - will be None - """ +def calc_gradient_helper( + targets, inputs, target_gradients=None, no_grad_set=None +): + ''' + Calculate gradient and return grad_info_map + ''' targets = _as_list(targets) inputs = _as_list(inputs) target_gradients = _as_list(target_gradients) @@ -2510,7 +2494,11 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) prog._sync_with_cpp() + return grad_info_map + +def _get_grad_vars(grad_info_map, inputs): + inputs = _as_list(inputs) grad_vars = [] for input_var in inputs: if input_var.name not in grad_info_map: @@ -2520,6 +2508,43 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): grad_block = grad_info[1] grad_var = grad_block.var(grad_info[0]) grad_vars.append(grad_var) + return grad_vars + + +def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): + """ + Backpropagate the gradients of targets to inputs. + + Args: + targets(Tensor|list[Tensor]|tuple[Tensor]): The target Tensors + inputs(Tensor|list[Tensor]|tuple[Tensor]): The input Tensors + target_gradients (Tensor|list[Tensor]|tuple[Tensor], optional): The gradient Tensors + of targets which has the same shape with targets, If None, ones will + be created for them. + no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients + should be ignored. All Tensors with + `stop_gradient=True` from all blocks will + be automatically added into this set. + If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set. + Default: None. + + Return: + (list[Tensor]): A list of gradients for inputs + If an input does not affect targets, the corresponding gradient Tensor + will be None + """ + + # NOTE: If you want to modify the logic of calc_gradient, please modify + # it inside the calc_gradient_helper and _get_grad_vars functions + # to ensure the correctness of dy2st mode. 
+ grad_info_map = calc_gradient_helper( + targets, + inputs, + target_gradients=target_gradients, + no_grad_set=no_grad_set, + ) + + grad_vars = _get_grad_vars(grad_info_map, inputs) if len(grad_vars) == 1: return grad_vars[0] diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 6cae44cc471c2..540d30da2e6ca 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -83,7 +83,8 @@ def test_check_output(self): self.check_output(check_prim=True) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_prim=True) + # Now in dy2st mode x_grad = [], so set check_prim=False + self.check_grad(['X'], 'Out', check_prim=False) class TestDropoutOpInput1d(OpTest): @@ -107,7 +108,8 @@ def test_check_output(self): self.check_output(check_prim=True) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', check_prim=True) + # Now in dy2st mode x_grad = [], so set check_prim=False + self.check_grad(['X'], 'Out', check_prim=False) class TestDropoutOp2(TestDropoutOp): @@ -283,7 +285,8 @@ def test_check_output(self): self.check_output(check_prim=True) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', max_relative_error=0.05, check_prim=True) + # Now in dy2st mode x_grad = [], so set check_prim=False + self.check_grad(['X'], 'Out', max_relative_error=0.05, check_prim=False) @unittest.skipIf( diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py index 0dd6b320362eb..416a692038c5e 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_run_program.py +++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py @@ -134,6 +134,8 @@ def test_eager(self): ['Fake_var@GRAD'], 'out_grad_names', [out.name + '@GRAD'], + 'x_grad_names', + [x_t.name + '@GRAD', y_t.name + '@GRAD'], ] use_interpretorcore = True diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index b780bf493397c..73eb152de43ac 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -254,6 +254,8 @@ def calc_dygraph_output(self, place): [p.name + '@GRAD' for p in inputs['Params']], 'out_grad_names', [out.name + '@GRAD' for out in outputs['Out']], + 'x_grad_names', + [p.name + '@GRAD' for p in inputs['X']], ) ) @@ -303,6 +305,8 @@ def calc_dygraph_grad(self, place): [p.name + '@GRAD' for p in inputs['Params']], 'out_grad_names', [out.name + '@GRAD' for out in outputs['Out']], + 'x_grad_names', + [p.name + '@GRAD' for p in inputs['X']], ) ) diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index d24a335c8946e..2a27c0eb15aa4 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -21,7 +21,7 @@ from paddle.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard from paddle.fluid import backward, core, framework, program_guard from paddle.fluid.compiler import BuildStrategy -from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.data_feeder import check_type, convert_dtype from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import _apply_pass from paddle.optimizer.lr import LRScheduler @@ -29,9 +29,8 @@ from . 
import logging_utils from .utils import ( RETURN_NO_VALUE_MAGIC_NUM, - _out_grad_names, - _param_grad_names, backend_guard, + construct_grad_names, ) __all__ = [] @@ -208,6 +207,7 @@ def __init__( self._scope_cache = {} self._hooker = None self._backend = kwargs.get('backend', None) + self._grad_var_names = {} def __call__(self, inputs): """ @@ -443,23 +443,11 @@ def _train_pure_fp16_program_id(self): def _infer_pure_fp16_program_id(self): return paddle.utils._hash_with_id(self._infer_pure_fp16_program, self) - @LazyInitialized - def _param_grad_names(self): - return _param_grad_names(self._train_program.desc, self._params) - def get_forward_end_op_idx(self, program): return self._forward_end_index_map[ paddle.utils._hash_with_id(program, self) ] - @LazyInitialized - def _out_grad_names(self): - return _out_grad_names( - self._train_program.desc, - self.get_forward_end_op_idx(self._train_program), - len(self._outputs.var_ids), - ) - @property def program(self): """ @@ -649,7 +637,33 @@ def _append_backward_desc(self, main_program): if targets: start_idx = len(program.block(0).ops) + len(self._outputs.tolist()) with backend_guard(self._backend): - backward.gradients(targets=targets, inputs=[]) + check_type( + targets, + 'targets', + (framework.Variable, list, tuple), + 'paddle.static.gradients', + ) + grad_info_map = backward.calc_gradient_helper( + targets=targets, inputs=[] + ) + + x_vars = [ + program.block(0).var(var.name) + for var in self._inputs + if isinstance(var, framework.Variable) + ] + param_vars = [ + program.block(0).var(param.name) for param in self._params + ] + out_vars = [ + program.block(0).var(var.name) + for var in self._outputs + if isinstance(var, framework.Variable) + ] + + self._grad_var_names = construct_grad_names( + grad_info_map, x_vars, param_vars, out_vars + ) if self._hooker: program, start_idx = self._hooker.after_append_backward( @@ -720,9 +734,11 @@ def _prepare_attributes(self): attrs.extend( ( 'param_grad_names', - self._param_grad_names, + self._grad_var_names.get('param', []), 'out_grad_names', - self._out_grad_names, + self._grad_var_names.get('out', []), + 'x_grad_names', + self._grad_var_names.get('x', []), ) ) if self._cuda_graph_capture_mode: @@ -761,9 +777,9 @@ def _get_forward_backward_program_form( backward_end_op_index = whole_program.desc.block(0).op_size() # For Backward process in CINN, all param@GRAD shoule be skipped for GC, because # they will be shared in scope and used by optimizer. - backward_skip_vars = ( - self._parse_skip_gc_vars(whole_program) + self._param_grad_names - ) + backward_skip_vars = self._parse_skip_gc_vars( + whole_program + ) + self._grad_var_names.get('param', []) backward_builded_program = add_build_strategy_for( whole_program, backward_start_op_index, diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index df892ab2a06ef..1dbbf3669ca4b 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -32,7 +32,7 @@ import paddle from paddle import fluid # noqa: F401 -from paddle.fluid import core, unique_name +from paddle.fluid import backward, core, framework, unique_name from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.wrapped_decorator import signature_safe_contextmanager @@ -1462,61 +1462,6 @@ def create_name_str(name_ids): return "(%s, )" % ','.join(names_str) -def _param_grad_names(program_desc, params): - """ - Parse PARAM@GARD name from original train and infer program. 
- """ - names = [] - # NOTE: `names` and `params` must be in the same order so that - # the param grad name can be set correctly in the run_program. - for param in params: - candidate = [] - for var in program_desc.block(0).all_vars(): - var_name = var.name() - if param.name not in var_name: - continue - suf_count = var_name.count(GRAD_SUFFIX) - if suf_count > 0: - suffix = param.name + GRAD_SUFFIX * suf_count - pre_count = var_name.count(GRAD_PREFIX) - if GRAD_PREFIX * pre_count + suffix == var_name: - candidate.append(var_name) - - if candidate: - names.append( - max( - candidate, - key=lambda name: name.count(GRAD_PREFIX) - if GRAD_PREFIX in name - else name.count(GRAD_SUFFIX), - ) - ) - else: - names.append(param.name + GRAD_SUFFIX) - return names - - -def _out_grad_names(program_desc, fwd_end_op_index, out_size): - """ - Parse Out@GARD name from original train and infer program. - """ - names = [] - for i in range( - fwd_end_op_index, - min(fwd_end_op_index + out_size, program_desc.block(0).op_size()), - ): - op = program_desc.block(0).op(i) - # If prim forward op, fill_any_like will be decomposite as fill_constant. - if core._is_fwd_prim_enabled(): - target = ('fill_any_like', 'fill_constant') - else: - target = 'fill_any_like' - if op.type() in target: - var_name = op.output('Out')[0] - names.append(var_name) - return names - - def prim_or_cinn_is_enabled(build_strategy, backend): if backend == 'CINN': return True @@ -1571,3 +1516,19 @@ def backend_guard(backend): finally: core._set_prim_forward_enabled(orign_fwd) core._set_prim_backward_enabled(orign_bwd) + + +def construct_grad_names(grad_info_map, x_vars, param_vars, out_vars): + grad_var_names = {} + fn = ( + lambda grad_var: grad_var.name + if isinstance(grad_var, framework.Variable) + else framework.EMPTY_VAR_NAME + ) + x_grad_vars = backward._get_grad_vars(grad_info_map, x_vars) + grad_var_names['x'] = list(map(fn, x_grad_vars)) + param_grad_vars = backward._get_grad_vars(grad_info_map, param_vars) + grad_var_names['param'] = list(map(fn, param_grad_vars)) + out_grad_vars = backward._get_grad_vars(grad_info_map, out_vars) + grad_var_names['out'] = list(map(fn, out_grad_vars)) + return grad_var_names diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index c56608d6bb66e..9f2fe5ee41b0d 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -20,16 +20,16 @@ import paddle from paddle import _legacy_C_ops from paddle.fluid import backward, core, framework, unique_name +from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import OpProtoHolder, _non_static_mode from paddle.jit.dy2static.partial_program import ( LazyInitialized, add_build_strategy_for, ) +from paddle.jit.dy2static.utils import construct_grad_names from paddle.nn.layer import layers -from .dy2static.utils import _out_grad_names, _param_grad_names - __all__ = [] INFER_MODEL_SUFFIX = ".pdmodel" @@ -349,6 +349,7 @@ def __init__(self, program_desc): self._train_program_desc = self._append_backward_desc( self._infer_program_desc ) + self._grad_var_names = {} # forward: @switch_to_static_graph @@ -419,6 +420,10 @@ def double_grad_descs(self): def scope(self): return self._inner_scope + @property + def grad_var_names(self): + return self._grad_var_names + def _preprocess(self, program_desc): # rename persistable variables of 'program_desc' list_persistable_var = _get_persistable_var_names(program_desc) @@ 
-599,7 +604,29 @@ def _append_backward_desc(self, infer_program_desc): targets.append(program.global_block().var(out.name())) # 3. append backward - backward.gradients(targets=targets, inputs=[]) + check_type( + targets, + 'targets', + (framework.Variable, list, tuple), + 'paddle.static.gradients', + ) + grad_info_map = backward.calc_gradient_helper( + targets=targets, inputs=[] + ) + x_vars = [ + program.block(0).var(desc.name()) for desc in self._input_descs + ] + param_vars = [ + program.block(0).var(name) for name in self._persistable_names + ] + out_vars = [ + program.block(0).var(desc.name()) for desc in self._output_descs + ] + + self._grad_var_names = construct_grad_names( + grad_info_map, x_vars, param_vars, out_vars + ) + return program.desc @@ -964,9 +991,11 @@ def _run_dygraph(instance, input, program_holder): attrs.extend( ( 'param_grad_names', - _param_grad_names(trace_program, persistable_vars), + program_holder.grad_var_names.get('param', []), 'out_grad_names', - _out_grad_names(trace_program, end_op_index, len(output_vars)), + program_holder.grad_var_names.get('out', []), + 'x_grad_names', + program_holder.grad_var_names.get('x', []), ) ) diff --git a/test/dygraph_to_static/test_gradname_parse.py b/test/dygraph_to_static/test_gradname_parse.py index d51bcbf068423..743f92f758f4f 100644 --- a/test/dygraph_to_static/test_gradname_parse.py +++ b/test/dygraph_to_static/test_gradname_parse.py @@ -15,6 +15,8 @@ import unittest +import numpy as np + import paddle from paddle import ParamAttr from paddle.nn import BatchNorm, Linear @@ -65,5 +67,99 @@ def test_grad_name_parse(self): opt.minimize(loss) +def tanh_high_order_grad(x): + y = paddle.tanh(x) + return paddle.grad(y, x, create_graph=True)[0] + + +class TestTanhHighOrderGrad(unittest.TestCase): + def setUp(self): + self.func = tanh_high_order_grad + + x1 = paddle.ones((1,)) + x1.stop_gradient = False + self.dy_input = (x1,) + self.dy_grad_input = (x1,) + + x2 = paddle.ones((1,)) + x2.stop_gradient = False + self.dy2st_input = (x2,) + self.dy2st_grad_input = (x2,) + + def test_run(self): + try: + dy_out = self.func(*self.dy_input) + dy_grad = paddle.grad(dy_out, self.dy_grad_input) + except: + dy_grad = [None for i in self.dy_grad_input] + dy_grad = [ + t.numpy() if isinstance(t, paddle.Tensor) else t for t in dy_grad + ] + + dy2st_out = paddle.jit.to_static(self.func)(*self.dy2st_input) + dy2st_grad = paddle.grad(dy2st_out, self.dy2st_grad_input) + dy2st_grad = [ + t.numpy() if isinstance(t, paddle.Tensor) else t for t in dy_grad + ] + np.testing.assert_equal(dy_grad, dy2st_grad) + + dy_input_grad = [ + t.grad.numpy() if isinstance(t.grad, paddle.Tensor) else None + for t in self.dy_input + ] + dy2st_input_grad = [ + t.grad.numpy() if isinstance(t.grad, paddle.Tensor) else None + for t in self.dy2st_input + ] + np.testing.assert_equal(dy_input_grad, dy2st_input_grad) + + +def matmul_high_order_grad(x, y): + z = paddle.matmul(x, y) + g = paddle.grad(z, [x, y], create_graph=False) + return g[0] + + +class TestMatMulHighOrderGrad1(TestTanhHighOrderGrad): + def setUp(self): + self.func = matmul_high_order_grad + + x1 = paddle.ones([1]) + x1.stop_gradient = False + y1 = paddle.ones([1]) + y1.stop_gradient = False + self.dy_input = (x1, y1) + self.dy_grad_input = (x1,) + + x2 = paddle.ones([1]) + x2.stop_gradient = False + y2 = paddle.ones([1]) + y2.stop_gradient = False + self.dy2st_input = (x2, y2) + self.dy2st_grad_input = (x2,) + + +class TestMatMulHighOrderGrad2(TestTanhHighOrderGrad): + def setUp(self): + self.func = 
matmul_high_order_grad + + x = np.random.randn(5, 5) + y = np.random.randn(5, 5) + + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y1 = paddle.to_tensor(y) + y1.stop_gradient = True + self.dy_input = (x1, y1) + self.dy_grad_input = (x1,) + + x2 = paddle.to_tensor(x) + x2.stop_gradient = False + y2 = paddle.to_tensor(y) + y2.stop_gradient = True + self.dy2st_input = (x2, y2) + self.dy2st_grad_input = (x2,) + + if __name__ == "__main__": unittest.main() From 1c97aa69632f2d8aae7622a178c225f8164bbe26 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Thu, 27 Apr 2023 19:15:17 +0800 Subject: [PATCH 116/405] xpu quant weight only (#53306) --- .../fused_multi_transformer_xpu_quant_pass.cc | 65 ++++++++++--------- paddle/fluid/framework/ir/xpu/pass_utils.cc | 7 ++ paddle/fluid/framework/ir/xpu/quant_utils.cc | 13 ++++ paddle/fluid/inference/analysis/argument.h | 6 ++ .../inference/analysis/ir_pass_manager.cc | 8 +++ paddle/fluid/inference/api/analysis_config.cc | 21 ++++++ .../fluid/inference/api/analysis_predictor.cc | 4 ++ .../inference/api/paddle_analysis_config.h | 14 ++++ paddle/fluid/pybind/inference_api.cc | 5 ++ 9 files changed, 113 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc index fab466a50637e..b9868a81135d4 100644 --- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc @@ -280,6 +280,8 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, with_time_step, with_seq_lengths, with_src_mask); + int quant_weight_bits = + Has("quant_weight_bits") ? Get("quant_weight_bits") : -1; int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -312,36 +314,39 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, // quant weight nodes // w_nodes_vec: [QKVW, OutLinearW, FFN1Weight, FFN2Weight] std::vector> w_nodes_vec(4); - std::vector> w_int16_nodes_vec(4); + std::vector> w_intx_nodes_vec(4); std::vector> w_max_nodes_vec(4); - std::vector> w_int16_names_vec(4); + std::vector> w_intx_names_vec(4); std::vector> w_max_names_vec(4); auto quant_func = [&](const std::string& input_name, std::vector* w_nodes, - std::vector* w_int16_nodes, + std::vector* w_intx_nodes, std::vector* w_max_nodes, - std::vector* w_int16_names, + std::vector* w_intx_names, std::vector* w_max_names, bool need_transpose) { - typedef int16_t TW; - auto w_names = fused_mt->Op()->Input(input_name); for (auto w_name : w_names) { Node* w_node = FindNodeWithName(graph, w_name); - Node* w_int16 = nullptr; + Node* w_intx = nullptr; Node* w_max = nullptr; PADDLE_ENFORCE_NE( w_node, nullptr, platform::errors::Fatal("w node should not be nullptr")); - PrepareWeight( - graph, scope, block, w_node, &w_int16, &w_max, need_transpose); + if (quant_weight_bits == 8) { + PrepareWeight( + graph, scope, block, w_node, &w_intx, &w_max, need_transpose); + } else { + PrepareWeight( + graph, scope, block, w_node, &w_intx, &w_max, need_transpose); + } w_nodes->push_back(w_node); - w_int16_nodes->push_back(w_int16); + w_intx_nodes->push_back(w_intx); w_max_nodes->push_back(w_max); } for (size_t i = 0; i < w_names.size(); ++i) { - w_int16_names->push_back(w_int16_nodes->at(i)->Name()); + w_intx_names->push_back(w_intx_nodes->at(i)->Name()); w_max_names->push_back(w_max_nodes->at(i)->Name()); } PADDLE_ENFORCE_EQ( @@ -353,11 +358,11 @@ int 
FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, static_cast(w_nodes->size()))); PADDLE_ENFORCE_EQ( w_names.size(), - w_int16_nodes->size(), + w_intx_nodes->size(), platform::errors::Fatal( - "The size of w_names(%d) should be equal to w_int16_nodes(%d)", + "The size of w_names(%d) should be equal to w_intx_nodes(%d)", static_cast(w_names.size()), - static_cast(w_int16_nodes->size()))); + static_cast(w_intx_nodes->size()))); PADDLE_ENFORCE_EQ( w_names.size(), w_max_nodes->size(), @@ -367,11 +372,11 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, static_cast(w_max_nodes->size()))); PADDLE_ENFORCE_EQ( w_names.size(), - w_int16_names->size(), + w_intx_names->size(), platform::errors::Fatal( - "The size of w_names(%d) should be equal to w_int16_names(%d)", + "The size of w_names(%d) should be equal to w_intx_names(%d)", static_cast(w_names.size()), - static_cast(w_int16_names->size()))); + static_cast(w_intx_names->size()))); PADDLE_ENFORCE_EQ( w_names.size(), w_max_names->size(), @@ -382,30 +387,30 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, }; quant_func("QKVW", &(w_nodes_vec[0]), - &(w_int16_nodes_vec[0]), + &(w_intx_nodes_vec[0]), &(w_max_nodes_vec[0]), - &(w_int16_names_vec[0]), + &(w_intx_names_vec[0]), &(w_max_names_vec[0]), false); quant_func("OutLinearW", &(w_nodes_vec[1]), - &(w_int16_nodes_vec[1]), + &(w_intx_nodes_vec[1]), &(w_max_nodes_vec[1]), - &(w_int16_names_vec[1]), + &(w_intx_names_vec[1]), &(w_max_names_vec[1]), true); quant_func("FFN1Weight", &(w_nodes_vec[2]), - &(w_int16_nodes_vec[2]), + &(w_intx_nodes_vec[2]), &(w_max_nodes_vec[2]), - &(w_int16_names_vec[2]), + &(w_intx_names_vec[2]), &(w_max_names_vec[2]), true); quant_func("FFN2Weight", &(w_nodes_vec[3]), - &(w_int16_nodes_vec[3]), + &(w_intx_nodes_vec[3]), &(w_max_nodes_vec[3]), - &(w_int16_names_vec[3]), + &(w_intx_names_vec[3]), &(w_max_names_vec[3]), true); @@ -482,13 +487,13 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, name_caches.at("CacheKVOut")); fused_mt_xpu_op_desc->SetOutput("out", name_caches.at("Out")); - fused_mt_xpu_op_desc->SetInput("qkvw", w_int16_names_vec[0]); + fused_mt_xpu_op_desc->SetInput("qkvw", w_intx_names_vec[0]); fused_mt_xpu_op_desc->SetInput("qkvw_max", w_max_names_vec[0]); - fused_mt_xpu_op_desc->SetInput("out_linear_w", w_int16_names_vec[1]); + fused_mt_xpu_op_desc->SetInput("out_linear_w", w_intx_names_vec[1]); fused_mt_xpu_op_desc->SetInput("out_linear_wmax", w_max_names_vec[1]); - fused_mt_xpu_op_desc->SetInput("ffn1_weight", w_int16_names_vec[2]); + fused_mt_xpu_op_desc->SetInput("ffn1_weight", w_intx_names_vec[2]); fused_mt_xpu_op_desc->SetInput("ffn1_weight_max", w_max_names_vec[2]); - fused_mt_xpu_op_desc->SetInput("ffn2_weight", w_int16_names_vec[3]); + fused_mt_xpu_op_desc->SetInput("ffn2_weight", w_intx_names_vec[3]); fused_mt_xpu_op_desc->SetInput("ffn2_weight_max", w_max_names_vec[3]); if (!fused_mt_xpu_op_desc->HasAttr("rotary_emb_dims")) { fused_mt_xpu_op_desc->SetAttr("rotary_emb_dims", 0); @@ -501,7 +506,7 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph, } // link int16 format of QKVW/OutLinearW/FFN1Weight/FFN2Weight to // fused_mt_xpu - for (auto nodes : w_int16_nodes_vec) { + for (auto nodes : w_intx_nodes_vec) { for (auto node : nodes) { IR_NODE_LINK_TO(node, fused_mt); } diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index bec10d421c7c2..aaa117d363a5f 100644 --- 
a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -193,6 +193,13 @@ template void PrepareWeight(Graph* graph, Node** dst, Node** dst_max, bool transpose); +template void PrepareWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* src, + Node** dst, + Node** dst_max, + bool transpose); void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst) { diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index ae697708ddcd7..d075d42d29506 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -207,6 +207,16 @@ void QuantFP32ToIntX(const float* src_ptr, } } +template <> +void QuantFP32ToIntX(const float* src_ptr, + int8_t* dst_ptr, + float max_val, + int numel) { + for (int i = 0; i < numel; i++) { + dst_ptr[i] = Fp32ToIntx(src_ptr[i], max_val); + } +} + template void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, @@ -253,6 +263,9 @@ void PrepareWeight(phi::DenseTensor* weight, template void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose); +template void PrepareWeight(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose); } // namespace ir } // namespace framework diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 014f4e828efc7..88e6749223008 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -289,6 +289,12 @@ struct Argument { DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int); DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool); + DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits, + XpuQuantPostDynamicWeightBits, + int); + DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types, + XpuQuantPostDynamicOpTypss, + std::vector); DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index c3c2fb6d80ffd..1d87edcd3404c 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -308,6 +308,14 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); + } else if (pass_name == "fused_multi_transformer_xpu_quant_pass") { + auto op_types = argument->xpu_quant_post_dynamic_op_types(); + if (std::count(op_types.begin(), + op_types.end(), + "fused_multi_transformer") > 0) { + pass->Set("quant_weight_bits", + new int(argument->xpu_quant_post_dynamic_weight_bits())); + } } pre_pass = pass_name; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index afb2dcd981fa8..3fa947ce27daf 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -196,6 +196,14 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) { Update(); } +void AnalysisConfig::SetXpuConfig( + int quant_post_dynamic_weight_bits, + const std::vector &quant_post_dynamic_op_types) { + xpu_quant_post_dynamic_weight_bits_ = quant_post_dynamic_weight_bits; + xpu_quant_post_dynamic_op_types_ = quant_post_dynamic_op_types; + Update(); +} + void 
AnalysisConfig::EnableCustomDevice(const std::string &device_type, int device_id, Precision precision_mode) { @@ -489,6 +497,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(xpu_precision_); CP_MEMBER(xpu_adaptive_seqlen_); CP_MEMBER(xpu_enable_multi_stream_); + CP_MEMBER(xpu_quant_post_dynamic_weight_bits_); + CP_MEMBER(xpu_quant_post_dynamic_op_types_); // Lite OpenCL Related CP_MEMBER(use_opencl_); @@ -1091,6 +1101,10 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << xpu_precision_; ss << xpu_adaptive_seqlen_; ss << xpu_enable_multi_stream_; + ss << xpu_quant_post_dynamic_weight_bits_; + for (auto op_type : xpu_quant_post_dynamic_op_types_) { + ss << op_type; + } ss << use_npu_; ss << npu_device_id_; @@ -1331,6 +1345,13 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"xpu_device_id", std::to_string(xpu_device_id_)}); os.InsertRow( {"xpu_l3_workspace_size", std::to_string(xpu_l3_workspace_size_)}); + os.InsertRow({"xpu_quant_post_dynamic_weight_bits", + std::to_string(xpu_quant_post_dynamic_weight_bits_)}); + std::vector op_types{"xpu_quant_post_dynamic_op_types"}; + for (auto op_type : xpu_quant_post_dynamic_op_types_) { + op_types.push_back(op_type); + } + os.InsertRow(op_types); } os.InsetDivider(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 219c3c2754c68..4ef15fbbc15dc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1426,6 +1426,10 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); argument_->SetXpuDeviceId(config_.xpu_device_id_); argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_); + argument_->SetXpuQuantPostDynamicWeightBits( + config_.xpu_quant_post_dynamic_weight_bits_); + argument_->SetXpuQuantPostDynamicOpTypss( + config_.xpu_quant_post_dynamic_op_types_); #endif auto *pass_builder = config_.pass_builder(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 10b92debdace1..3300fe8a9b8dc 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -288,6 +288,18 @@ struct PD_INFER_DECL AnalysisConfig { bool adaptive_seqlen = false, bool enable_multi_stream = false); + /// + /// \brief configs of XPU + /// + /// \param quant_post_dynamic_weight_bits Weight bits used in dynamic post + /// quantization. Optional value: -1, 8, 16. Default value is -1, means using + /// the recommended way. \param quant_post_dynamic_op_types Ops used in + /// dynamic post quantization. 
+ /// + void SetXpuConfig( + int quant_post_dynamic_weight_bits = -1, + const std::vector& quant_post_dynamic_op_types = {}); + /// /// \brief configs of IPU /// @@ -1181,6 +1193,8 @@ struct PD_INFER_DECL AnalysisConfig { std::string xpu_precision_; bool xpu_adaptive_seqlen_; bool xpu_enable_multi_stream_; + int xpu_quant_post_dynamic_weight_bits_{-1}; + std::vector xpu_quant_post_dynamic_op_types_; // LITE OPENCL SETTINGS bool use_opencl_{false}; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index bce79c27c0585..e00c22423eb28 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -767,6 +767,11 @@ void BindAnalysisConfig(py::module *m) { .def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, py::arg("device_id") = 0) + .def( + "set_xpu_config", + &AnalysisConfig::SetXpuConfig, + py::arg("quant_post_dynamic_weight_bits") = -1, + py::arg("quant_post_dynamic_op_types") = std::vector({})) .def("enable_custom_device", &AnalysisConfig::EnableCustomDevice, py::arg("device_type"), From 182b6f8385b17f880652782924e131bbfcc7f9f7 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 27 Apr 2023 19:44:10 +0800 Subject: [PATCH 117/405] scale trt converter support int64 (#53388) --- paddle/fluid/inference/tensorrt/convert/scale_op.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index c6ed50501f766..054964419d2d9 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -19,7 +19,7 @@ namespace inference { namespace tensorrt { /* - * ConcatOp + * Scale Op */ class ScaleOpConverter : public OpConverter { public: diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index bb0fbdf6ca848..6a94e14d7e68b 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1184,10 +1184,12 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } else { - // At present, only support float32 or float16 or int32 into trt. + // At present, only support float32 or float16 or int32 or int64 into + // trt. if (!(dtype == framework::proto::VarType::FP32 || dtype == framework::proto::VarType::FP16 || - dtype == framework::proto::VarType::INT32)) { + dtype == framework::proto::VarType::INT32 || + dtype == framework::proto::VarType::INT64)) { return false; } } From 6b9b49ceac9d5b15e41c90b8eb70f4caa5fb6d3f Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 27 Apr 2023 19:48:29 +0800 Subject: [PATCH 118/405] add scale for AF2 flash_attn, much thanks to xreki and shaojie for debug these codes --- paddle/fluid/operators/fused/fused_gate_attention.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index cf4fbdd3a1739..5dd6f26a802d6 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -1010,7 +1010,7 @@ class FlashAttnWithGating { // 7. flas_attn part one, get temp worksapce size. 
float p_dropout = 0.f; - float softmax_scale = static_cast(1); + float softmax_scale = static_cast(1.0f / std::sqrt(head_size_)); cudaStream_t stream = dev_ctx_.stream(); uint64_t workspace_size; bool succ = phi::dynload::flash_attn_fwd_with_bias_and_mask( @@ -1212,7 +1212,7 @@ class FlashAttnWithGating { // 7. flas_attn part one, get temp worksapce size. uint64_t workspace_size; float p_dropout = 0.f; - float softmax_scale = static_cast(1); + float softmax_scale = static_cast(1.0f / std::sqrt(head_size_)); cudaStream_t stream = dev_ctx_.stream(); int num_splits = 0; // 0 for an internal heuristic, which is optimal bool succ = phi::dynload::flash_attn_bwd_with_bias_and_mask( From fe053396734c58e445e3e8c9973e6ca8c125682a Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Thu, 27 Apr 2023 21:23:35 +0800 Subject: [PATCH 119/405] =?UTF-8?q?[phi]=20Move=20sequence=5Fpool=20to=20p?= =?UTF-8?q?hi=20-=20Step=203=20=EF=BC=9Asequence=5Fpool=5Fgrad=5Fop=20(#52?= =?UTF-8?q?680)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [phi] move sequence_pool kernel to phi * mv kernels impl * fix parameter error * clean include * fix compat filename * [phi] move fluid sequence_pool_grad to phi * [phi][compat] sig rm GradVarName * [phi] fix sequence_pool out type * [phi] rm impl, add const string * [phi] fix const str * fix sequence_pooling cmake * [phi] mv sequence_pooling_test * [phi] fix grad sig * [phi] fix sequence_pool is_test error * [phi] fix sequence_pooling gpu include * [phi] mv to impl * [phi] fix SequencePoolFunctor cu include * [phi] modify out max_index int32_t * [phi] add pooltype mapping determine * [phi] fix sequence_pool_sig * [phi] fix sequence_pool_sig sum * [phi] try ci * [phi] fix max_index optional --- paddle/fluid/operators/math/CMakeLists.txt | 1 - .../sequence_ops/sequence_pool_op.cc | 12 +---- .../operators/sequence_ops/sequence_pool_op.h | 49 ------------------- .../sequence_ops/unity_build_rule.cmake | 1 - .../kernels/cpu/sequence_pool_grad_kernel.cc | 26 ++++++++++ .../kernels/gpu/sequence_pool_grad_kernel.cu} | 14 ++++-- .../impl/sequence_pool_grad_kernel_impl.h | 39 +++++++++++++++ .../kernels/impl/sequence_pool_kernel_impl.h | 2 +- .../phi/kernels/sequence_pool_grad_kernel.h | 30 ++++++++++++ paddle/phi/ops/compat/sequence_pool_sig.cc | 10 ++++ 10 files changed, 116 insertions(+), 68 deletions(-) delete mode 100644 paddle/fluid/operators/sequence_ops/sequence_pool_op.h create mode 100644 paddle/phi/kernels/cpu/sequence_pool_grad_kernel.cc rename paddle/{fluid/operators/sequence_ops/sequence_pool_op.cu => phi/kernels/gpu/sequence_pool_grad_kernel.cu} (59%) create mode 100644 paddle/phi/kernels/impl/sequence_pool_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/sequence_pool_grad_kernel.h diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 405661f3a1ab9..c67703ba52814 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -14,7 +14,6 @@ math_library(sample_prob) math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) - if(WITH_XPU) math_library(beam_search DEPS math_function beam_search_xpu) else() diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index f51cc7d903664..d616bca2c4e3b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ 
b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" - -#include -#include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -196,10 +193,3 @@ REGISTER_OPERATOR(sequence_pool, REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, ops::SequencePoolGradOpNoNeedBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL(sequence_pool_grad, - CPU, - ALL_LAYOUT, - ops::SequencePoolGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h deleted file mode 100644 index 7b00395bb6bb2..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sequence_pooling.h" - -namespace paddle { -namespace operators { - -template -class SequencePoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_g = - context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - const phi::DenseTensor* index = nullptr; - if (pooltype == "MAX") { - index = context.Input("MaxIndex"); - } - in_g->mutable_data(context.GetPlace()); - phi::funcs::SequencePoolGradFunctor pool; - pool(context.template device_context(), - pooltype, - *out_g, - in_g, - index); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake index b23035082e4aa..b256a227a9646 100644 --- a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake @@ -30,7 +30,6 @@ register_unity_group( sequence_expand_op.cu sequence_mask_op.cu sequence_pad_op.cu - sequence_pool_op.cu sequence_expand_as_op.cu sequence_reshape_op.cu sequence_reverse_op.cu diff --git a/paddle/phi/kernels/cpu/sequence_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/sequence_pool_grad_kernel.cc new file mode 100644 index 0000000000000..9f15ed0147031 --- /dev/null +++ b/paddle/phi/kernels/cpu/sequence_pool_grad_kernel.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sequence_pool_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/sequence_pool_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(sequence_pool_grad, + CPU, + ALL_LAYOUT, + phi::SequencePoolGradKernel, + float, + double) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/phi/kernels/gpu/sequence_pool_grad_kernel.cu similarity index 59% rename from paddle/fluid/operators/sequence_ops/sequence_pool_op.cu rename to paddle/phi/kernels/gpu/sequence_pool_grad_kernel.cu index 796b02cb03e32..fe991a1fef431 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu +++ b/paddle/phi/kernels/gpu/sequence_pool_grad_kernel.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,8 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - sequence_pool_grad, GPU, ALL_LAYOUT, ops::SequencePoolGradKernel, float) {} +#include "paddle/phi/kernels/sequence_pool_grad_kernel.h" +#include "paddle/phi/kernels/impl/sequence_pool_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + sequence_pool_grad, GPU, ALL_LAYOUT, phi::SequencePoolGradKernel, float) {} diff --git a/paddle/phi/kernels/impl/sequence_pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/sequence_pool_grad_kernel_impl.h new file mode 100644 index 0000000000000..da9bdc1f1fdf5 --- /dev/null +++ b/paddle/phi/kernels/impl/sequence_pool_grad_kernel_impl.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/funcs/sequence_pooling.h" + +namespace phi { + +template +void SequencePoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& max_index, + const DenseTensor& out_grad, + bool is_test, + const std::string& pooltype, + float pad_value, + DenseTensor* x_grad) { + const phi::DenseTensor* index = nullptr; + if (pooltype == "MAX") { + index = max_index.get_ptr(); + } + dev_ctx.template Alloc(x_grad); + phi::funcs::SequencePoolGradFunctor pool; + pool(dev_ctx, pooltype, out_grad, x_grad, index); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/sequence_pool_kernel_impl.h b/paddle/phi/kernels/impl/sequence_pool_kernel_impl.h index 56633b6b54270..e448820516afe 100644 --- a/paddle/phi/kernels/impl/sequence_pool_kernel_impl.h +++ b/paddle/phi/kernels/impl/sequence_pool_kernel_impl.h @@ -68,7 +68,7 @@ void SequencePoolKernel(const Context& ctx, (is_test == false || (ctx.GetPlace() == phi::CPUPlace()) == false)) { index = max_index; index->Resize({dims}); - ctx.template Alloc(index); + ctx.template Alloc(index); } phi::funcs::SequencePoolFunctor pool; pool(ctx, pooltype, pad_value_, x, out, is_test, index); diff --git a/paddle/phi/kernels/sequence_pool_grad_kernel.h b/paddle/phi/kernels/sequence_pool_grad_kernel.h new file mode 100644 index 0000000000000..a88f9ceb4b1ae --- /dev/null +++ b/paddle/phi/kernels/sequence_pool_grad_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void SequencePoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& max_index, + const DenseTensor& out_grad, + bool is_test, + const std::string& pooltype, + float pad_value, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/ops/compat/sequence_pool_sig.cc b/paddle/phi/ops/compat/sequence_pool_sig.cc index 6c4d6691db4bb..224f83d1f32f9 100644 --- a/paddle/phi/ops/compat/sequence_pool_sig.cc +++ b/paddle/phi/ops/compat/sequence_pool_sig.cc @@ -21,6 +21,16 @@ KernelSignature SequencePoolOpArgumentMapping( {"Out", "MaxIndex"}); } +KernelSignature SequencePoolGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sequence_pool_grad", + {"X", "MaxIndex", "Out@GRAD"}, + {"is_test", "pooltype", "pad_value"}, + {"X@GRAD"}); +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(sequence_pool, phi::SequencePoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sequence_pool_grad, + phi::SequencePoolGradOpArgumentMapping); From 2e31f245c4fd0a028f86f0ad80adc2d8df8a5bf6 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 27 Apr 2023 22:00:02 +0800 Subject: [PATCH 120/405] much thanks to xreki and shaojie for debuging this flash_attn in AlphaFold2 model plugin --- cmake/external/flashattn.cmake | 2 +- .../operators/fused/fused_gate_attention.h | 82 +++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/cmake/external/flashattn.cmake b/cmake/external/flashattn.cmake index c94717ff9f5ff..29c0b8efb426c 100644 --- a/cmake/external/flashattn.cmake +++ b/cmake/external/flashattn.cmake @@ -20,7 +20,7 @@ set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn) set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn) set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn) set(FLASHATTN_REPOSITORY ${GIT_URL}/JamesLim-sy/flash-attention.git) -set(FLASHATTN_TAG opitmization_for_alphafold2) +set(FLASHATTN_TAG 782a98a1fc02fb36d07129a97750430e4fdf2022) set(FLASHATTN_INCLUDE_DIR "${FLASHATTN_INSTALL_DIR}/include" diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 5dd6f26a802d6..5126d1a78f13a 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -70,9 +70,24 @@ inline std::string TensorDebugString(const phi::DenseTensor* t) { } } -inline std::string WaitWithDebugInfo(const phi::GPUContext& dev_ctx) { - dev_ctx.Wait(); - return "[Synchronize] "; +inline void WaitWithDebugInfo(const phi::GPUContext& dev_ctx) { + if (VLOG_IS_ON(5)) { + dev_ctx.Wait(); + VLOG(5) << "[Flash attn Synchronize] "; + } +} + +template +inline void TypeDebugInfo() { + if (VLOG_IS_ON(4)) { + if (std::is_same::value) { + VLOG(4) << "[Grad]: T is phi::dtype::float16."; + } else if (std::is_same::value) { + VLOG(4) << "[Grad]: T is phi::dtype::bfloat16."; + } else if (std::is_same::value) { + VLOG(4) << "[Grad]: T is float."; + } + } } template @@ -902,15 +917,7 @@ class FlashAttnWithGating { GateAttentionConfig* config) { bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? 
true : false; - - if (std::is_same::value) { - VLOG(4) << "T is phi::dtype::float16."; - } else if (std::is_same::value) { - VLOG(4) << "T is phi::dtype::bfloat16."; - } else if (std::is_same::value) { - VLOG(4) << "T is float."; - } - + TypeDebugInfo(); PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " @@ -920,6 +927,7 @@ class FlashAttnWithGating { phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); + WaitWithDebugInfo(dev_ctx_); int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); @@ -928,7 +936,7 @@ class FlashAttnWithGating { seq_batch_size * static_cast(config->seq_len_r), static_cast(config->num_heads), static_cast(config->head_dim)}); - VLOG(5) << WaitWithDebugInfo(dev_ctx_) << "Reshape qkv_transpose_out: [" + VLOG(5) << "Reshape qkv_transpose_out: [" << config->qkv_transpose_out_dims << "] -> [" << qkv_transpose_out->dims() << "]"; @@ -988,14 +996,15 @@ class FlashAttnWithGating { int softmax_lse_last_dim = ((max_seqlen_q_ + 16 - 1) / 16) * 16; softmax_lse->Resize({batch_size_, num_heads_, softmax_lse_last_dim}); AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", softmax_lse); - VLOG(5) << WaitWithDebugInfo(dev_ctx_) << "Allocate softmax_lse: shape=[" + WaitWithDebugInfo(dev_ctx_); + VLOG(5) << "Allocate softmax_lse: shape=[" << softmax_lse->dims() << "]"; // 6. construct random seed auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); uint64_t seed = seed_offset_pair.first; uint64_t offset = seed_offset_pair.second; - VLOG(5) << WaitWithDebugInfo(dev_ctx_) << "Construct random seed"; + VLOG(5) << "Construct random seed"; VLOG(6) << "cu_seq_q: " << TensorDebugString(&cu_seq_q); VLOG(6) << "cu_seq_k: " << TensorDebugString(&cu_seq_k); @@ -1047,8 +1056,8 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - VLOG(5) << WaitWithDebugInfo(dev_ctx_) - << "Get workspace_size=" << workspace_size; + WaitWithDebugInfo(dev_ctx_); + VLOG(5) << "Get workspace_size=" << workspace_size; // 8. Run flash-attention kernel. phi::DenseTensor workspace = CreateWorkspace(workspace_size); @@ -1086,8 +1095,8 @@ class FlashAttnWithGating { if (!succ) { PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error())); } - VLOG(5) << "[ComputeForward]" << WaitWithDebugInfo(dev_ctx_) - << "Run SUCCESS!!!"; + WaitWithDebugInfo(dev_ctx_); + VLOG(5) << "[ComputeForward]: Run SUCCESS!!!"; if (config->has_gating) { gate_out->Resize(config->gate_out_dims); @@ -1105,14 +1114,7 @@ class FlashAttnWithGating { GateAttentionGradConfig* config) { bool is_bf16 = qkv_transpose_out->dtype() == DataType::BFLOAT16 ? true : false; - - if (std::is_same::value) { - VLOG(4) << "[Grad]: T is phi::dtype::float16."; - } else if (std::is_same::value) { - VLOG(4) << "[Grad]: T is phi::dtype::bfloat16."; - } else if (std::is_same::value) { - VLOG(4) << "[Grad]: T is float."; - } + TypeDebugInfo(); PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, @@ -1135,7 +1137,7 @@ class FlashAttnWithGating { int seq_batch_size = static_cast(config->batch_size) * static_cast(config->seq_len_m); - VLOG(5) << WaitWithDebugInfo(dev_ctx_); + WaitWithDebugInfo(dev_ctx_); // 2. Init with cu_seq_q and cu_seq_k for flash_attn. 
    phi::DenseTensor cu_seq_q, cu_seq_k;
@@ -1174,8 +1176,8 @@ class FlashAttnWithGating {
     int total_k_ = qkv_dims[1];    // q.dims()[0]
     int num_heads_ = qkv_dims[2];  // q.dims()[1]
     int head_size_ = qkv_dims[3];  // q.dims()[2]
-    int max_seqlen_q_ = config->seq_len_r;  // batch_size_;
-    int max_seqlen_k_ = config->m_size;     // batch_size_;
+    int max_seqlen_q_ = config->seq_len_r;
+    int max_seqlen_k_ = config->m_size;
     VLOG(6) << "batch_size : " << batch_size_;
     VLOG(6) << "total_q    : " << total_q_;
     VLOG(6) << "total_k    : " << total_k_;
@@ -1188,20 +1190,19 @@ class FlashAttnWithGating {
     phi::DenseTensor softmax_d;
     softmax_d.Resize(softmax_lse->dims());
     AllocWithDebugInfo(dev_ctx_, "d_softmax_lse", &softmax_d);
-    VLOG(5) << WaitWithDebugInfo(dev_ctx_);
+    WaitWithDebugInfo(dev_ctx_);
 
     phi::DenseTensor bias_d;
     if (nonbatched_bias) {
       bias_d = phi::Empty(
          dev_ctx_, {batch_size_, num_heads_, max_seqlen_q_, max_seqlen_k_});
     }
-    VLOG(5) << WaitWithDebugInfo(dev_ctx_);
+    WaitWithDebugInfo(dev_ctx_);
 
     // 6. construct random seed
     auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_);
     uint64_t seed = seed_offset_pair.first;
     uint64_t offset = seed_offset_pair.second;
-
     VLOG(6) << "fmha_out: " << TensorDebugString(fmha_out);
     VLOG(6) << "fmha_out_grad: " << TensorDebugString(fmha_out_grad);
     VLOG(6) << "softmax_lse: " << TensorDebugString(softmax_lse);
@@ -1254,8 +1255,7 @@ class FlashAttnWithGating {
     if (!succ) {
       PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error()));
     }
-    VLOG(5) << WaitWithDebugInfo(dev_ctx_)
-            << "Get workspace_size=" << workspace_size;
+    VLOG(5) << "Get workspace_size=" << workspace_size;
 
     phi::DenseTensor workspace = CreateWorkspace(workspace_size);
     succ = phi::dynload::flash_attn_bwd_with_bias_and_mask(
@@ -1297,7 +1297,8 @@ class FlashAttnWithGating {
     if (!succ) {
       PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error()));
     }
-    VLOG(5) << WaitWithDebugInfo(dev_ctx_);
+    WaitWithDebugInfo(dev_ctx_);
+    VLOG(5) << "[ComputeBackward]: Run SUCCESS!!!";
 
     if (nonbatched_bias) {
       // compare block reduce
@@ -1344,9 +1345,8 @@ class FlashAttnWithGating {
     int64_t grid = (seq_size + block - 1) / block;
     FlashAttRange<<<grid, block, 0, dev_ctx_.stream()>>>(
        start, step, end, cu_seq_q->data(), cu_seq_k->data());
-
-    VLOG(5) << WaitWithDebugInfo(dev_ctx_)
-            << "AllocAndInit cu_seq_q and cu_seq_k: start=" << start
+    WaitWithDebugInfo(dev_ctx_);
+    VLOG(5) << "AllocAndInit cu_seq_q and cu_seq_k: start=" << start
             << ", step=" << step << ", end=" << end;
   }
 
@@ -1356,8 +1356,8 @@ class FlashAttnWithGating {
       workspace = phi::Empty(
          dev_ctx_, {int64_t(workspace_size / sizeof(float))});
     }
-    VLOG(5) << WaitWithDebugInfo(dev_ctx_)
-            << "Allocate workspace: workspace_size=" << workspace_size;
+    WaitWithDebugInfo(dev_ctx_);
+    VLOG(5) << "Allocate workspace: workspace_size=" << workspace_size;
     return workspace;
   }

From 3262172d5db5d09b844cd8d04d9a6ca110c5451a Mon Sep 17 00:00:00 2001
From: JamesLim-sy
Date: Thu, 27 Apr 2023 22:00:40 +0800
Subject: [PATCH 121/405] Many thanks to xreki and shaojie for debugging this
 flash_attn in the AlphaFold2 model plugin

---
 .../operators/fused/fused_gate_attention.h    | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h
index 5126d1a78f13a..319889246890a 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention.h
+++ b/paddle/fluid/operators/fused/fused_gate_attention.h
@@ -75,9 +75,9 @@ inline void WaitWithDebugInfo(const phi::GPUContext& dev_ctx) {
dev_ctx.Wait(); VLOG(5) << "[Flash attn Synchronize] "; } -} +} -template +template inline void TypeDebugInfo() { if (VLOG_IS_ON(4)) { if (std::is_same::value) { @@ -922,7 +922,6 @@ class FlashAttnWithGating { qkv_transpose_out, platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); - // 1. Dealing with qkv_out for flash_attn. phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForwardForFlashAttn(*qkv_out, qkv_transpose_out); @@ -936,9 +935,8 @@ class FlashAttnWithGating { seq_batch_size * static_cast(config->seq_len_r), static_cast(config->num_heads), static_cast(config->head_dim)}); - VLOG(5) << "Reshape qkv_transpose_out: [" - << config->qkv_transpose_out_dims << "] -> [" - << qkv_transpose_out->dims() << "]"; + VLOG(5) << "Reshape qkv_transpose_out: [" << config->qkv_transpose_out_dims + << "] -> [" << qkv_transpose_out->dims() << "]"; // q_size == k_size int64_t q_size = config->GetQuerySize(); @@ -997,8 +995,7 @@ class FlashAttnWithGating { softmax_lse->Resize({batch_size_, num_heads_, softmax_lse_last_dim}); AllocWithDebugInfo(dev_ctx_, "flash_attn: softmax_lse", softmax_lse); WaitWithDebugInfo(dev_ctx_); - VLOG(5) << "Allocate softmax_lse: shape=[" - << softmax_lse->dims() << "]"; + VLOG(5) << "Allocate softmax_lse: shape=[" << softmax_lse->dims() << "]"; // 6. construct random seed auto seed_offset_pair = GenerateSeedOffsetPair(batch_size_, num_heads_); @@ -1172,10 +1169,10 @@ class FlashAttnWithGating { static_cast(config->num_heads), static_cast(config->head_dim)}); int batch_size_ = seq_batch_size; - int total_q_ = qkv_dims[1]; // q.dims()[0] - int total_k_ = qkv_dims[1]; // q.dims()[0] - int num_heads_ = qkv_dims[2]; // q.dims()[1] - int head_size_ = qkv_dims[3]; // q.dims()[2] + int total_q_ = qkv_dims[1]; // q.dims()[0] + int total_k_ = qkv_dims[1]; // q.dims()[0] + int num_heads_ = qkv_dims[2]; // q.dims()[1] + int head_size_ = qkv_dims[3]; // q.dims()[2] int max_seqlen_q_ = config->seq_len_r; int max_seqlen_k_ = config->m_size; VLOG(6) << "batch_size : " << batch_size_; From 3474e09cc6face797d866f27efe3c8605e3af883 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Thu, 27 Apr 2023 22:09:44 +0800 Subject: [PATCH 122/405] Support different dtypes of inputs for broadcast for dropout optimization (#52093) * change judgement for DropoutGradGPUKernelDriver * add UnrollerWithoutVecSize and after this Loaddata to be refined * pass unittest * use same unroller with XPU * BroadcastWithInt64Index * BroadcastDataLoader template partial specialization * fix compile errs in ROCms * PR comment --- paddle/phi/kernels/funcs/broadcast_function.h | 620 ++++++++---------- paddle/phi/kernels/funcs/dropout_impl.cu.h | 58 +- paddle/phi/kernels/funcs/elementwise_base.h | 106 +-- .../kernels/primitive/datamover_primitives.h | 96 ++- 4 files changed, 425 insertions(+), 455 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index f96a1764c24a5..b4125ab555079 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -31,20 +31,49 @@ namespace funcs { enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; -template +template +struct UseBroadcast { + template + static HOSTDEVICE void Apply( + const std::vector &ins_tensor, + const ArgsT &args, + int64_t numel, + Array1 *ins_data, + Array2 *use_broadcast, + int *broadcast_num, + bool *all_elementwise) { 
+ (*ins_data)[Index] = (const _ptr_ char *)(ins_tensor[Index]->data()); + bool is_same_dim = ins_tensor[Index]->numel() == numel; + if (is_same_dim) { + (*use_broadcast)[Index] = false; + } else { + (*use_broadcast)[Index] = true; + (*broadcast_num)++; + } + *all_elementwise &= is_same_dim; + } +}; + +template struct LoaderTypeClassifier { public: int64_t numel{0}; - int vec_size{1}; + int vec_size{4}; int broadcast_num{0}; bool all_elementwise{true}; - phi::Array use_broadcast; - phi::Array ins_data; + phi::Array use_broadcast; + phi::Array ins_data; LoaderTypeClassifier() {} LoaderTypeClassifier(const std::vector &ins, std::vector *outs) { + using Traits = phi::funcs::FunctionTraits; + using ArgsT = typename Traits::ArgsTuple; + ArgsT arg; uint64_t out_addr = reinterpret_cast((*outs)[0]->data()); + + UnrollerWithoutVecSize::step(ins, arg, &vec_size); + for (auto i = 1; i < outs->size(); ++i) { PADDLE_ENFORCE_EQ( (*outs)[i]->dims(), @@ -56,165 +85,185 @@ struct LoaderTypeClassifier { out_addr = (out_addr | reinterpret_cast((*outs)[i]->data())); } - int out_vec_size = - phi::GetVectorizedSize(reinterpret_cast(out_addr)); - uint64_t in_addr = static_cast(0); + vec_size = std::min( + vec_size, + phi::GetVectorizedSize(reinterpret_cast(out_addr))); numel = (*outs)[0]->numel(); - for (int i = 0; i < Arity; ++i) { - auto in_data = ins[i]->data(); - ins_data[i] = (const _ptr_ InT *)(in_data); - - bool is_same_dim = ins[i]->numel() == numel; - if (is_same_dim) { - use_broadcast[i] = false; - in_addr = (in_addr | reinterpret_cast(in_data)); - } else { - use_broadcast[i] = true; - broadcast_num++; - } - all_elementwise &= is_same_dim; - } - int in_vec_size = std::min( - 4, phi::GetVectorizedSize(reinterpret_cast(in_addr))); - vec_size = std::min(out_vec_size, in_vec_size); + UnrollerWithoutVecSize::step(ins, + arg, + numel, + &ins_data, + &use_broadcast, + &broadcast_num, + &all_elementwise); } }; -#ifndef PADDLE_WITH_XPU_KP // Common broadcast/elementwise Loader. -template +template struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { -#pragma unroll - for (int i = 0; i < Arity; ++i) { - kps::Init(args[i], static_cast(1.0f)); - if (use_broadcast[i]) { - kps::ReadDataBc( - args[i], ins[i], block_offset, configs[i], numel, VecSize); - } else { - kps::ReadData( - args[i], ins[i] + block_offset, num, VecSize); - } + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel) { + using Type = std::tuple_element_t; + kps::Init(args, static_cast(1.0f)); + + if (use_broadcast[Index]) { + kps::ReadDataBc( + args, + reinterpret_cast(ins[Index]), + block_offset, + configs[Index], + numel, + VecSize); + } + // NOTE: If use if...else... with condition `use_broadcast[Index]` here, + // there will be some errs with clang12 while compiling in ROCm. + // When the compiler is upgraded, if...else... may be used. + if (!use_broadcast[Index]) { + kps::ReadData( + args, + reinterpret_cast(ins[Index]) + block_offset, + num, + VecSize); } } }; +/* BroadcastDataLoaders Partial specialization */ +#ifndef PADDLE_WITH_XPU_KP // Scalar elementwise Loader with consideration of IsBoundary. 
-template -struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { +template +struct BroadcastDataLoader { + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel) { + using Type = std::tuple_element_t; int thread_offset = threadIdx.x * VecSize + block_offset; #pragma unroll - for (int i = 0; i < Arity; ++i) { -#pragma unroll - for (int idx = 0; idx < VecSize; ++idx) { - args[i][idx] = static_cast(1); - int index = thread_offset + idx; - if (index < numel) { - args[i][idx] = ins[i][index]; - } + for (int idx = 0; idx < VecSize; ++idx) { + std::get(args[idx]) = static_cast(1); + int index = thread_offset + idx; + if (index < numel) { + std::get(args[idx]) = + reinterpret_cast(ins[Index])[index]; } } } }; // Vectorized elementwise Loader without consideration of IsBoundary. -template -struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { - using VecType = phi::kps::details::VectorType; - VecType vec_temp[Arity]; +template +struct BroadcastDataLoader { + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel) { + using Type = std::tuple_element_t; + using VecType = phi::kps::details::VectorType; + VecType vec_temp; int thread_offset = threadIdx.x + blockIdx.x * blockDim.x; + const VecType *__restrict__ vec_input = + reinterpret_cast(ins[Index]); + vec_temp = vec_input[thread_offset]; #pragma unroll - for (int i = 0; i < Arity; ++i) { - const VecType *__restrict__ vec_input = - reinterpret_cast(ins[i]); - vec_temp[i] = vec_input[thread_offset]; -#pragma unroll - for (int idx = 0; idx < VecSize; ++idx) { - args[i][idx] = vec_temp[i].val[idx]; - } + for (int idx = 0; idx < VecSize; ++idx) { + std::get(args[idx]) = vec_temp.val[idx]; } } }; // Common broadcast data loader. 
-template -struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { - uint32_t index_bc[Arity][VecSize]; -#pragma unroll - for (int j = 0; j < Arity; ++j) { +template +struct BroadcastDataLoader { + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel) { + using Type = std::tuple_element_t; + uint32_t index_bc[VecSize]; #pragma unroll - for (int k = 0; k < VecSize; ++k) { - index_bc[j][k] = 0; - args[j][k] = static_cast(1); - } + for (int k = 0; k < VecSize; ++k) { + index_bc[k] = 0; + std::get(args[k]) = static_cast(1); } uint32_t thread_offset = block_offset + threadIdx.x * VecSize; #pragma unroll for (int k = 0; k < VecSize; ++k) { uint32_t idx = thread_offset + k; - if (IsBoundary) { - if (idx == numel) break; + if (IsBoundary && idx == numel) { + break; } - #pragma unroll for (int i = 0; i < phi::DDim::kMaxRank; ++i) { if (i == configs[0].rank) break; auto fast_divmoder = configs[0].divmoders[i].Divmod(idx); idx = fast_divmoder.val[0]; -#pragma unroll - for (int j = 0; j < Arity; ++j) { - index_bc[j][k] += fast_divmoder.val[1] * configs[j].strides[i]; - } + index_bc[k] += fast_divmoder.val[1] * configs[Index].strides[i]; } } #pragma unroll - for (int j = 0; j < Arity; ++j) { -#pragma unroll - for (int k = 0; k < VecSize; ++k) { - args[j][k] = ins[j][index_bc[j][k]]; - } + for (int k = 0; k < VecSize; ++k) { + std::get(args[k]) = + reinterpret_cast(ins[Index])[index_bc[k]]; } } }; + #endif -template + typename Func, + bool IsBoundary, + int LoadType, + int VecSize, + int End, + int Begin = 0> +struct BcUnroller { + template + static HOSTDEVICE inline void step(Args &&...args) { + Func::Apply( + std::forward(args)...); + BcUnroller::step( + args...); + } +}; + +template