Revert "[vulkan] Pad channels when using texture storage instead of "…
Browse files Browse the repository at this point in the history
…tight packing" (pytorch#95251)"

This reverts commit 0eeb046.
pruthvistony committed May 2, 2023
1 parent 23b9d4e commit cd941e4
Showing 16 changed files with 247 additions and 418 deletions.
7 changes: 4 additions & 3 deletions aten/src/ATen/native/vulkan/api/Tensor.cpp
@@ -124,7 +124,8 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
 
   c10::SmallVector<int64_t, 6u> gpu_sizes(3);
 
-  // Channel dim will be be aligned to the next multiple of 4
+  // Channel dim will be always be aligned. For 4 dimensional tensors, batch
+  // and channel are combined, then aligned.
   switch (ndim) {
     case 1:
       gpu_sizes[0] = 4;
@@ -145,8 +146,8 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
       break;
 
     case 4:
-      int64_t padded_c = api::utils::align_up(sizes[1], INT64_C(4));
-      gpu_sizes[0] = sizes[0] * padded_c;
+      int64_t combined_depth = sizes[0] * sizes[1];
+      gpu_sizes[0] = api::utils::align_up(combined_depth, INT64_C(4));
       gpu_sizes[1] = sizes[2];
       gpu_sizes[2] = sizes[3];
       break;
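[Editor's note] For reference, a minimal standalone C++ sketch contrasting the restored rule (combine batch and channel, then align) with the reverted per-channel padding. The helper align_up here only mirrors the behavior of api::utils::align_up, and the N=2, C=5 shape is a hypothetical example:

#include <cstdint>
#include <iostream>

// Round v up to the next multiple of m (mirrors api::utils::align_up).
int64_t align_up(int64_t v, int64_t m) {
  return ((v + m - 1) / m) * m;
}

int main() {
  // Hypothetical NCHW tensor: N=2, C=5, H=8, W=8.
  const int64_t sizes[4] = {2, 5, 8, 8};

  // Restored behavior: batch and channel are combined, then aligned.
  const int64_t combined_depth = sizes[0] * sizes[1];          // 10
  const int64_t tight = align_up(combined_depth, INT64_C(4));  // 12 texel layers

  // Reverted behavior (pytorch#95251): each sample's channels padded to 4.
  const int64_t padded = sizes[0] * align_up(sizes[1], INT64_C(4));  // 2*8 = 16

  std::cout << "tight packing depth:   " << tight << '\n'    // 12
            << "padded channels depth: " << padded << '\n';  // 16
  return 0;
}

For shapes where C is not a multiple of 4, tight packing uses fewer texel layers because padding is amortized across the whole batch rather than paid once per sample.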
91 changes: 32 additions & 59 deletions aten/src/ATen/native/vulkan/glsl/cat_feature.glsl
@@ -1,74 +1,47 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/*
- * Output Image
- */
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
-
-/*
- * Input Textures
- */
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // output texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 out_extents;
-  // input texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 in_extents;
-  // input tensor's batch size
-  uint batch_size;
-  // input tensor's channel size
-  uint ch_size;
-  // channel interval (total # of channels for all tensors)
-  uint ch_interval;
-  // # of channels for tensor 0 to i-1 at ith tensor
-  uint ch_size_allprior;
-}
-uBlock;
-
-/*
- * Local Work Group
- */
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 size;            // output texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 isize;           // input texture size (x=width,y=height,z=depth,w=unused)
+  uint batch_size;       // input tensor's batch size
+  uint ch_size;          // input tensor's channel size
+  uint ch_interval;      // channel interval (total # of channels for all tensors)
+  uint ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor
+} uBlock;
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
-  const ivec3 in_pos = ivec3(gl_GlobalInvocationID);
+  const ivec3 posIn = ivec3(gl_GlobalInvocationID);
   const uint max_src_index = uBlock.ch_size * uBlock.batch_size;
 
-  if (any(greaterThanEqual(in_pos, uBlock.in_extents.xyz))) {
-    return;
-  }
-
-  // x and y don't change. only z and index matter
-  ivec3 out_pos = in_pos;
-  const vec4 in_tex = texelFetch(uInput, in_pos, 0);
-
-  for (uint i = 0; i < 4; ++i) {
-    uint src_index = in_pos.z * 4 + i;
-
-    if (src_index >= max_src_index) {
-      // out of range
-      break;
-    }
-
-    uint src_n_idx = src_index / uBlock.ch_size;
-    uint src_c_idx = src_index % uBlock.ch_size;
-
-    uint dst_nc_idx =
-        src_n_idx * uBlock.ch_interval + src_c_idx + uBlock.ch_size_allprior;
-
-    out_pos.z = int(dst_nc_idx / 4);
-    uint j = (dst_nc_idx % 4);
-
-    vec4 out_tex = imageLoad(uOutput, out_pos);
-    out_tex[j] = in_tex[i];
-    imageStore(uOutput, out_pos, out_tex);
+  if (all(lessThan(posIn, uBlock.isize.xyz))) {
+    ivec3 posOut = posIn; // x and y don't change. only z and index matter
+    const vec4 inval = texelFetch(uInput, posIn, 0);
+
+    for (uint i = 0; i < 4; ++i)
+    {
+      uint src_index = posIn.z * 4 + i;
+      if (src_index >= max_src_index) {
+        // out of range
+        break;
+      }
+
+      uint dst_index = uint(src_index / uBlock.ch_size) * uBlock.ch_interval + (src_index % uBlock.ch_size) + uBlock.ch_size_allprior;
+      posOut.z = int(dst_index / 4);
+      uint j = (dst_index % 4);
+
+      vec4 outval = imageLoad(uOutput, posOut);
+      outval[j] = inval[i];
+      imageStore(uOutput, posOut, outval);
+    }
   }
 }
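[Editor's note] To see what the restored indexing does, a small C++ sketch of the dst_index arithmetic. The sizes are hypothetical: two 3-channel tensors concatenated along channels, so ch_interval = 6, and this invocation handles the second input with ch_size_allprior = 3:

#include <cstdio>

int main() {
  // Hypothetical feature cat: batch_size=2, this input has ch_size=3,
  // total channels across all inputs ch_interval=6, and ch_size_allprior=3
  // (i.e. this is the second of two 3-channel inputs).
  const unsigned batch_size = 2, ch_size = 3, ch_interval = 6, ch_size_allprior = 3;
  const unsigned max_src_index = ch_size * batch_size;

  for (unsigned src_index = 0; src_index < max_src_index; ++src_index) {
    // Same formula as the shader: the batch index scales by the full channel
    // interval; the channel offset shifts past the earlier inputs' channels.
    unsigned dst_index = (src_index / ch_size) * ch_interval
        + (src_index % ch_size) + ch_size_allprior;
    // Destination texel layer and component within the RGBA texel:
    printf("src %u -> dst %u (texel z=%u, component %u)\n",
           src_index, dst_index, dst_index / 4, dst_index % 4);
  }
  return 0;
}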
27 changes: 7 additions & 20 deletions aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl
@@ -20,10 +20,9 @@ uBuffer;
  * Params Buffer
  */
 layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // Extents of the output texture
+  // xyz contain the extents of the input texture, w contains HxW to help
+  // calculate buffer offsets
   ivec4 in_extents;
-  // Number of texels spanned by one channel
-  ivec2 c_info;
 }
 uBlock;
 
@@ -41,25 +40,13 @@ void main() {
 
   const vec4 intex = texelFetch(uImage, pos, 0);
 
-  const int n_index = int(pos.z / uBlock.c_info.x);
-  const int c_index = (pos.z % uBlock.c_info.x) * 4;
-  int d_offset = (n_index * uBlock.c_info.y) + c_index;
-
   const int base_index =
-      pos.x + uBlock.in_extents.x * pos.y + uBlock.in_extents.w * d_offset;
+      pos.x + uBlock.in_extents.x * pos.y + (4 * uBlock.in_extents.w) * pos.z;
   const ivec4 buf_indices =
       base_index + ivec4(0, 1, 2, 3) * uBlock.in_extents.w;
 
-  if (c_index < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.x] = intex.x;
-  }
-  if (c_index + 1 < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.y] = intex.y;
-  }
-  if (c_index + 2 < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.z] = intex.z;
-  }
-  if (c_index + 3 < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.w] = intex.w;
-  }
+  uBuffer.data[buf_indices.x] = intex.x;
+  uBuffer.data[buf_indices.y] = intex.y;
+  uBuffer.data[buf_indices.z] = intex.z;
+  uBuffer.data[buf_indices.w] = intex.w;
 }
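[Editor's note] A quick C++ sketch of the restored buffer-offset math, under hypothetical 3x2 texture extents with in_extents.w = H*W: each texel at (x, y, z) packs four consecutive channels, whose elements sit H*W entries apart in the NCHW staging buffer.

#include <cstdio>

int main() {
  // Hypothetical input texture extents: W=3, H=2, so in_extents.w = H*W = 6.
  const int W = 3, H = 2, HW = W * H;

  // One texel position, e.g. (x=1, y=0, z=2).
  const int x = 1, y = 0, z = 2;

  // Restored formula: each texel layer z covers 4 channels, so its first
  // element starts 4*HW entries into the buffer per layer.
  const int base_index = x + W * y + (4 * HW) * z;

  // The texel's r/g/b/a components each land one channel plane (HW) apart.
  for (int i = 0; i < 4; ++i) {
    printf("component %d -> buffer index %d\n", i, base_index + i * HW);
  }
  return 0;
}

The unconditional stores suggest the staging buffer is allocated for the aligned combined depth, but that allocation lives outside this diff.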
59 changes: 18 additions & 41 deletions aten/src/ATen/native/vulkan/glsl/mean.glsl
@@ -1,77 +1,54 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/*
- * Output Image
- */
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
-
-/*
- * Input Textures
- */
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // extents of the output texture
-  // w contains pre-computed H*W of the input texture for convenience
-  ivec4 out_extents;
-  // extents of the input texture
-  // w contains size of input channels aligned to 4
-  ivec4 in_extents;
-}
-uBlock;
-
-/*
- * Shared memory buffer
- */
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 size;
+  ivec3 isize;
+} uBlock;
+
 shared vec4 sh_mem[64];
 
-/*
- * Local Work Group
- */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-/*
- * Computes the mean of an input tensor along the width, height, and channel
- * axes.
- */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   const ivec3 tid = ivec3(gl_LocalInvocationID);
   const ivec3 group_size = ivec3(gl_WorkGroupSize);
 
-  if (pos.z < uBlock.in_extents.z) {
+  if (pos.z < uBlock.isize.z) {
     vec4 sum = vec4(0);
 
-    for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) {
-      for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) {
+    for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) {
+      for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) {
         sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
       }
     }
 
-    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] =
-        sum;
+    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum;
   }
   memoryBarrierShared();
   barrier();
 
-  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.out_extents.z) {
+  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.size.z) {
     return;
   }
 
   vec4 total = vec4(0);
   for (int y = 0; y < group_size.y; ++y) {
     for (int x = 0; x < group_size.x; ++x) {
-      total +=
-          sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
+      total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
     }
   }
 
-  imageStore(uOutput, pos, total / uBlock.out_extents.w);
+  imageStore(
+      uOutput,
+      pos,
+      total / uBlock.size.w);
 }
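[Editor's note] The two-phase shared-memory reduction itself is unchanged by the revert; only the uniform names differ. A rough single-threaded C++ emulation of what one work group computes, under a hypothetical 4x4 local group over an 8x8 input plane:

#include <cstdio>

int main() {
  // Hypothetical sizes: one 8x8 input plane, 4x4 local work group,
  // and size.w = H*W = 64 as the mean divisor.
  const int W = 8, H = 8, GX = 4, GY = 4;
  const float divisor = float(W * H);

  float sh_mem[GY][GX] = {};  // per-thread partial sums (one channel shown)

  // Phase 1: every thread strides over the plane and accumulates its share.
  for (int ty = 0; ty < GY; ++ty)
    for (int tx = 0; tx < GX; ++tx)
      for (int y = ty; y < H; y += GY)
        for (int x = tx; x < W; x += GX)
          sh_mem[ty][tx] += 1.0f;  // stand-in for texelFetch(uInput, ...)

  // Phase 2 (thread (0,0) only, after the barrier): total the partials.
  float total = 0.0f;
  for (int y = 0; y < GY; ++y)
    for (int x = 0; x < GX; ++x)
      total += sh_mem[y][x];

  printf("mean = %f\n", total / divisor);  // 1.0 for the all-ones plane
  return 0;
}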
93 changes: 38 additions & 55 deletions aten/src/ATen/native/vulkan/glsl/mean2d.glsl
@@ -1,90 +1,73 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/*
- * Output Image
- */
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
-
-/*
- * Input Textures
- */
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // extents of the output texture
-  // w contains pre-computed H*W of the input texture for convenience
-  ivec4 out_extents;
-  // extents of the input texture
-  // w contains size of input channels aligned to 4
-  ivec4 in_extents;
-}
-uBlock;
-
-/*
- * Shared memory buffer
- */
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 size;
+  ivec3 isize;
+} uBlock;
+
 shared vec4 sh_mem[64];
 
-/*
- * Local Work Group
- */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-/*
- * Computes the mean of an input tensor along the width and height axes.
- */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   const ivec3 tid = ivec3(gl_LocalInvocationID);
   const ivec3 group_size = ivec3(gl_WorkGroupSize);
 
-  if (pos.z < uBlock.in_extents.z) {
+  if (pos.z < uBlock.isize.z) {
     vec4 sum = vec4(0);
 
-    for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) {
-      for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) {
+    for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) {
+      for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) {
         sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
       }
     }
 
-    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] =
-        sum;
+    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum;
  }
   memoryBarrierShared();
   barrier();
 
-  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.in_extents.z) {
+  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.isize.z) {
     return;
   }
 
   vec4 total = vec4(0);
   for (int y = 0; y < group_size.y; ++y) {
     for (int x = 0; x < group_size.x; ++x) {
-      total +=
-          sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
+      total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
     }
   }
 
-  const vec4 outtex = total / uBlock.out_extents.w;
-
-  const int nc_idx = pos.z * 4;
-  const int out_width = uBlock.out_extents.x;
-  const int out_height = uBlock.out_extents.y;
-
-  for (int i = 0; i < 4; ++i) {
-    const int n_idx = (nc_idx + i) / uBlock.in_extents.w;
-    const int c_idx = (nc_idx + i) % uBlock.in_extents.w;
-
-    ivec3 pos = ivec3(c_idx, n_idx, 0);
-    if (c_idx < out_width && n_idx < out_height) {
-      imageStore(uOutput, pos, vec4(outtex[i], 0, 0, 0));
-    }
-  }
+  const vec4 outtex = total / uBlock.size.w;
+  const int zoutx = 4*pos.z;
+  const int width = uBlock.size.x;
+  const int maxlen = uBlock.size.x * uBlock.size.y;
+
+  const int zouty = min(zoutx + 1, maxlen);
+  ivec3 posy = ivec3((zouty)%width, (zouty)/width, 0);
+  vec4 outy = vec4(outtex.y, 0, 0, 0);
+  imageStore(uOutput, posy, outy);
+
+  const int zoutz = min(zoutx + 2, maxlen);
+  ivec3 posz = ivec3((zoutz)%width, (zoutz)/width, 0);
+  vec4 outz = vec4(outtex.z, 0, 0, 0);
+  imageStore(uOutput, posz, outz);
+
+  const int zoutw = min(zoutx + 3, maxlen);
+  ivec3 posw = ivec3((zoutw)%width, (zoutw)/width, 0);
+  vec4 outw = vec4(outtex.w, 0, 0, 0);
+  imageStore(uOutput, posw, outw);
+
+  ivec3 posx = ivec3(zoutx%width, zoutx/width, 0);
+  vec4 outx = vec4(outtex.x, 0, 0, 0);
+  imageStore(uOutput, posx, outx);
 }
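[Editor's note] The restored epilogue spreads one texel's four channel means across the 2-D output image. A small C++ sketch of that index math, under a hypothetical output of width 5 (C=5 channel means per row) and height 2 (N=2):

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical output: N=2 rows of C=5 channel means each.
  const int width = 5, height = 2;
  const int maxlen = width * height;

  // Each input texel layer z carries channels 4*z .. 4*z+3.
  for (int z = 0; z * 4 < maxlen; ++z) {
    const int zoutx = 4 * z;
    for (int i = 0; i < 4; ++i) {
      // Mirrors the shader's min() clamp so trailing components of the last
      // texel don't stray arbitrarily far past the image.
      const int zout = std::min(zoutx + i, maxlen);
      printf("z=%d comp %d -> out (x=%d, y=%d)\n",
             z, i, zout % width, zout / width);
    }
  }
  return 0;
}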
