Skip to content

Commit

Permalink
Optimize Paddle diagonal (#47904)
Browse files — browse the repository at this point in the history
  • Loading branch information
201716010711 committed Dec 8, 2022
1 parent de2c5fd commit b91bbd3
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 74 deletions.
67 changes: 38 additions & 29 deletions paddle/phi/kernels/cpu/diagonal_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ void DiagonalKernel(const Context& dev_ctx,
auto* output = out;
T* output_data = dev_ctx.template Alloc<T>(output);
auto output_dim = vectorize(output->dims());
auto output_dim_size = output_dim.size();

const int64_t offset_ = offset;
int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
Expand All @@ -43,40 +44,48 @@ void DiagonalKernel(const Context& dev_ctx,
std::vector<int64_t> input_stride = funcs::ComputeDimStride(input_dim);
std::vector<int64_t> output_stride = funcs::ComputeDimStride(output_dim);

int64_t numel = input->numel();

for (int64_t idx = 0; idx < numel; idx++) {
std::vector<int64_t> idx_dim(input_dim_size);
int64_t out_numel = out->numel();
for (int64_t idx = 0; idx < out_numel; idx++) {
std::vector<int64_t> idx_dim(output_dim_size);
int64_t temp = 0;
for (size_t i = 0; i < input_dim_size; i++) {
idx_dim[i] = (idx - temp) / input_stride[i];
temp = temp + idx_dim[i] * input_stride[i];
for (size_t i = 0; i < output_dim_size; i++) {
idx_dim[i] = (idx - temp) / output_stride[i];
temp = temp + idx_dim[i] * output_stride[i];
}

int64_t axis1_dim = idx_dim[axis1_];
int64_t axis2_dim = idx_dim[axis2_];

idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));
idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));

bool flag = false;
if (offset_ == 0 && axis1_dim == axis2_dim) {
idx_dim.push_back(axis1_dim);
flag = true;
} else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
idx_dim.push_back(axis1_dim);
flag = true;
} else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
idx_dim.push_back(axis2_dim);
flag = true;
int64_t tmp = idx_dim[output_dim_size - 1];
std::vector<int64_t> list;
list.clear();
int64_t l = std::min(axis1_, axis2_);
int64_t r = std::max(axis1_, axis2_);
for (size_t j = 0; j < output_dim_size - 1; j++) {
list.push_back(idx_dim[j]);
}
if (flag) {
int64_t idx_output = 0;
for (size_t i = 0; i < idx_dim.size(); i++) {
idx_output = idx_output + idx_dim[i] * output_stride[i];
if (offset_ == 0) {
list.insert(list.begin() + l, tmp);
list.insert(list.begin() + r, tmp);
} else if (offset_ > 0) {
if (axis1_ < axis2_) {
list.insert(list.begin() + l, tmp);
list.insert(list.begin() + r, tmp + offset_);
} else {
list.insert(list.begin() + l, tmp + offset_);
list.insert(list.begin() + r, tmp);
}
output_data[idx_output] = input_data[idx];
} else if (offset_ < 0) {
if (axis1_ < axis2_) {
list.insert(list.begin() + l, tmp - offset_);
list.insert(list.begin() + r, tmp);
} else {
list.insert(list.begin() + l, tmp);
list.insert(list.begin() + r, tmp - offset_);
}
}

int64_t input_offset = 0;
for (size_t i = 0; i < input_dim_size; i++) {
input_offset = input_offset + list[i] * input_stride[i];
}
output_data[idx] = input_data[input_offset];
}
}
} // namespace phi
Expand Down
88 changes: 44 additions & 44 deletions paddle/phi/kernels/funcs/diagonal.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,59 +156,59 @@ __global__ void DiagonalCuda(const T* data1,
int64_t* x_stride,
int64_t* out_stride,
int64_t numel,
int64_t out_numel,
bool is_grad) {
CUDA_KERNEL_LOOP(idx, numel) {
int64_t idx_dim[X_DIM_SIZE] = {0};
CUDA_KERNEL_LOOP(idx, out_numel) {
int64_t idx_dim[OUT_DIM_SIZE] = {0};
int64_t temp = 0;
for (size_t i = 0; i < X_DIM_SIZE - 1; i++) {
idx_dim[i] = (idx - temp) / x_stride[i];
temp = temp + idx_dim[i] * x_stride[i];
for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
idx_dim[i] = (idx - temp) / out_stride[i];
temp = temp + idx_dim[i] * out_stride[i];
}
idx_dim[X_DIM_SIZE - 1] = idx - temp;

int64_t axis1_dim = idx_dim[axis1_];
int64_t axis2_dim = idx_dim[axis2_];

int64_t out_dim[OUT_DIM_SIZE] = {0};
int temp_pos = 0;
for (int i = 0; i < X_DIM_SIZE; i++) {
if (i != axis1_ && i != axis2_) {
out_dim[temp_pos] = idx_dim[i];
temp_pos++;
idx_dim[OUT_DIM_SIZE - 1] = idx - temp;
int64_t tmp = idx - temp;
int64_t list[9];
int64_t p = 0;
for (size_t j = 0; j < X_DIM_SIZE; j++) {
if (j == axis1_ || j == axis2_) {
list[j] = 0;
} else {
list[j] = idx_dim[p];
p += 1;
}
}
bool flag = false;
if (offset_ == 0 && axis1_dim == axis2_dim) {
out_dim[temp_pos] = axis1_dim;
flag = true;
} else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
out_dim[temp_pos] = axis1_dim;
flag = true;
} else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
out_dim[temp_pos] = axis2_dim;
flag = true;
}
if (!is_grad) {
if (flag) {
int64_t idx_output = 0;
for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
idx_output = idx_output + out_dim[i] * out_stride[i];
}
idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1];
data2[idx_output] = data1[idx];
int64_t l = min(axis1_, axis2_);
int64_t r = max(axis1_, axis2_);
if (offset_ == 0) {
list[l] = tmp;
list[r] = tmp;
} else if (offset_ > 0) {
if (axis1_ < axis2_) {
list[l] = tmp;
list[r] = tmp + offset_;
} else {
list[l] = tmp + offset_;
list[r] = tmp;
}
} else {
if (flag) {
int64_t idx_output = 0;
for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
idx_output = idx_output + out_dim[i] * out_stride[i];
}
idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1];
data2[idx] = data1[idx_output];
} else if (offset_ < 0) {
if (axis1_ < axis2_) {
list[l] = tmp - offset_;
list[r] = tmp;
} else {
data2[idx] = static_cast<T>(0);
list[l] = tmp;
list[r] = tmp - offset_;
}
}
int64_t input_offset = 0;

for (size_t i = 0; i < X_DIM_SIZE; i++) {
input_offset = input_offset + list[i] * x_stride[i];
}
if (!is_grad) {
data2[idx] = data1[input_offset];
} else {
data2[input_offset] = data1[idx];
}
}
}
#endif
Expand Down
12 changes: 12 additions & 0 deletions paddle/phi/kernels/gpu/diagonal_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ void DiagonalGradKernel(const Context& dev_ctx,
int threads = PADDLE_CUDA_NUM_THREADS;
int blocks = (numel + threads - 1) / threads;

int64_t dout_numel = out_grad.numel();
phi::backends::gpu::GpuMemsetAsync(
dx_data, 0, numel * sizeof(T), dev_ctx.stream());

switch (dx_dim_size) {
case 2:
funcs::DiagonalCuda<T, 2, 1><<<blocks, threads>>>(dout_data,
Expand All @@ -72,6 +76,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 3:
Expand All @@ -83,6 +88,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 4:
Expand All @@ -94,6 +100,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 5:
Expand All @@ -105,6 +112,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 6:
Expand All @@ -116,6 +124,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 7:
Expand All @@ -127,6 +136,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 8:
Expand All @@ -138,6 +148,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
case 9:
Expand All @@ -149,6 +160,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
dx_stride,
dout_stride,
numel,
dout_numel,
true);
break;
default:
Expand Down
11 changes: 10 additions & 1 deletion paddle/phi/kernels/gpu/diagonal_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ void DiagonalKernel(const Context& dev_ctx,
int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2;
int64_t numel = input->numel();
int64_t out_numel = out->numel();

int threads = PADDLE_CUDA_NUM_THREADS;
int blocks = (numel + threads - 1) / threads;
int blocks = (out_numel + threads - 1) / threads;

switch (input_dim_size) {
case 2:
Expand All @@ -68,6 +69,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 3:
Expand All @@ -79,6 +81,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 4:
Expand All @@ -90,6 +93,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 5:
Expand All @@ -101,6 +105,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 6:
Expand All @@ -112,6 +117,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 7:
Expand All @@ -123,6 +129,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 8:
Expand All @@ -134,6 +141,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
case 9:
Expand All @@ -145,6 +153,7 @@ void DiagonalKernel(const Context& dev_ctx,
input_stride,
output_stride,
numel,
out_numel,
false);
break;
default:
Expand Down
29 changes: 29 additions & 0 deletions python/paddle/fluid/tests/unittests/test_diagonal_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,35 @@ def test_check_grad(self):
pass


class TestDiagonalOpCase4(TestDiagonalOp):
    """Diagonal of a 2-D int64 input with offset=1 and swapped axes (axis1=1, axis2=0)."""

    def init_config(self):
        # 100x100 integer matrix; astype('int64') truncates randn samples toward zero.
        self.case = np.random.randn(100, 100).astype('int64')
        self.inputs = {'Input': self.case}
        self.attrs = {'offset': 1, 'axis1': 1, 'axis2': 0}
        # NumPy reference result; the attrs keys match np.diagonal's keyword names,
        # so they can be forwarded directly.
        self.target = np.diagonal(self.inputs['Input'], **self.attrs)

    def test_check_grad(self):
        # Gradient check is skipped for this integer-typed case.
        pass


class TestDiagonalOpCase5(TestDiagonalOp):
    """Diagonal of a 4-D float32 input with a negative offset across axes 0 and 3.

    Unlike TestDiagonalOpCase4, test_check_grad is not overridden here.
    """

    def init_config(self):
        # 4x2x4x4 float tensor exercising non-adjacent diagonal axes.
        self.case = np.random.randn(4, 2, 4, 4).astype('float32')
        self.inputs = {'Input': self.case}
        self.attrs = {'offset': -2, 'axis1': 0, 'axis2': 3}
        # NumPy reference result; attrs keys line up with np.diagonal's
        # keyword arguments and are forwarded as-is.
        self.target = np.diagonal(self.inputs['Input'], **self.attrs)


class TestDiagonalAPI(unittest.TestCase):
def setUp(self):
self.shape = [10, 3, 4]
Expand Down

0 comments on commit b91bbd3

Please sign in to comment.