Revert "[vulkan] Pad channels when using texture storage instead of "…
Browse files Browse the repository at this point in the history
…tight packing" (pytorch#95251)"

This reverts commit 0eeb046.
pruthvistony committed May 2, 2023
1 parent 23b9d4e commit cd941e4
Showing 16 changed files with 247 additions and 418 deletions.
7 changes: 4 additions & 3 deletions aten/src/ATen/native/vulkan/api/Tensor.cpp
@@ -124,7 +124,8 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
 
   c10::SmallVector<int64_t, 6u> gpu_sizes(3);
 
-  // Channel dim will be be aligned to the next multiple of 4
+  // Channel dim will be always be aligned. For 4 dimensional tensors, batch
+  // and channel are combined, then aligned.
   switch (ndim) {
     case 1:
       gpu_sizes[0] = 4;
@@ -145,8 +146,8 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
       break;
 
     case 4:
-      int64_t padded_c = api::utils::align_up(sizes[1], INT64_C(4));
-      gpu_sizes[0] = sizes[0] * padded_c;
+      int64_t combined_depth = sizes[0] * sizes[1];
+      gpu_sizes[0] = api::utils::align_up(combined_depth, INT64_C(4));
       gpu_sizes[1] = sizes[2];
       gpu_sizes[2] = sizes[3];
       break;
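[Editor's note] For reference, a minimal standalone C++ sketch contrasting the restored rule (combine batch and channel, then align) with the reverted per-channel padding. The helper align_up here only mirrors the behavior of api::utils::align_up, and the N=2, C=5 shape is a hypothetical example:

#include <cstdint>
#include <iostream>

// Round v up to the next multiple of m (mirrors api::utils::align_up).
int64_t align_up(int64_t v, int64_t m) {
  return ((v + m - 1) / m) * m;
}

int main() {
  // Hypothetical NCHW tensor: N=2, C=5, H=8, W=8.
  const int64_t sizes[4] = {2, 5, 8, 8};

  // Restored behavior: batch and channel are combined, then aligned.
  const int64_t combined_depth = sizes[0] * sizes[1];          // 10
  const int64_t tight = align_up(combined_depth, INT64_C(4));  // 12 texel layers

  // Reverted behavior (pytorch#95251): each sample's channels padded to 4.
  const int64_t padded = sizes[0] * align_up(sizes[1], INT64_C(4));  // 2*8 = 16

  std::cout << "tight packing depth:   " << tight << '\n'    // 12
            << "padded channels depth: " << padded << '\n';  // 16
  return 0;
}

For shapes where C is not a multiple of 4, tight packing uses fewer texel layers because padding is amortized across the whole batch rather than paid once per sample.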
91 changes: 32 additions & 59 deletions aten/src/ATen/native/vulkan/glsl/cat_feature.glsl
@@ -1,74 +1,47 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/*
- * Output Image
- */
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
-
-/*
- * Input Textures
- */
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // output texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 out_extents;
-  // input texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 in_extents;
-  // input tensor's batch size
-  uint batch_size;
-  // input tensor's channel size
-  uint ch_size;
-  // channel interval (total # of channels for all tensors)
-  uint ch_interval;
-  // # of channels for tensor 0 to i-1 at ith tensor
-  uint ch_size_allprior;
-}
-uBlock;
-
-/*
- * Local Work Group
- */
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 size;            // output texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 isize;           // input texture size (x=width,y=height,z=depth,w=unused)
+  uint batch_size;       // input tensor's batch size
+  uint ch_size;          // input tensor's channel size
+  uint ch_interval;      // channel interval (total # of channels for all tensors)
+  uint ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor
+} uBlock;
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
-  const ivec3 in_pos = ivec3(gl_GlobalInvocationID);
+  const ivec3 posIn = ivec3(gl_GlobalInvocationID);
   const uint max_src_index = uBlock.ch_size * uBlock.batch_size;
 
-  if (any(greaterThanEqual(in_pos, uBlock.in_extents.xyz))) {
-    return;
-  }
-
-  // x and y don't change. only z and index matter
-  ivec3 out_pos = in_pos;
-  const vec4 in_tex = texelFetch(uInput, in_pos, 0);
-
-  for (uint i = 0; i < 4; ++i) {
-    uint src_index = in_pos.z * 4 + i;
-
-    if (src_index >= max_src_index) {
-      // out of range
-      break;
-    }
-
-    uint src_n_idx = src_index / uBlock.ch_size;
-    uint src_c_idx = src_index % uBlock.ch_size;
-
-    uint dst_nc_idx =
-        src_n_idx * uBlock.ch_interval + src_c_idx + uBlock.ch_size_allprior;
-
-    out_pos.z = int(dst_nc_idx / 4);
-    uint j = (dst_nc_idx % 4);
-
-    vec4 out_tex = imageLoad(uOutput, out_pos);
-    out_tex[j] = in_tex[i];
-    imageStore(uOutput, out_pos, out_tex);
+  if (all(lessThan(posIn, uBlock.isize.xyz))) {
+    ivec3 posOut = posIn; // x and y don't change. only z and index matter
+    const vec4 inval = texelFetch(uInput, posIn, 0);
+
+    for (uint i = 0; i < 4; ++i)
+    {
+      uint src_index = posIn.z * 4 + i;
+      if (src_index >= max_src_index) {
+        // out of range
+        break;
+      }
+
+      uint dst_index = uint(src_index / uBlock.ch_size) * uBlock.ch_interval + (src_index % uBlock.ch_size) + uBlock.ch_size_allprior;
+      posOut.z = int(dst_index / 4);
+      uint j = (dst_index % 4);
+
+      vec4 outval = imageLoad(uOutput, posOut);
+      outval[j] = inval[i];
+      imageStore(uOutput, posOut, outval);
+    }
   }
 }
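[Editor's note] To see what the restored indexing does, a small C++ sketch of the dst_index arithmetic. The sizes are hypothetical: two 3-channel tensors concatenated along channels, so ch_interval = 6, and this invocation handles the second input with ch_size_allprior = 3:

#include <cstdio>

int main() {
  // Hypothetical feature cat: batch_size=2, this input has ch_size=3,
  // total channels across all inputs ch_interval=6, and ch_size_allprior=3
  // (i.e. this is the second of two 3-channel inputs).
  const unsigned batch_size = 2, ch_size = 3, ch_interval = 6, ch_size_allprior = 3;
  const unsigned max_src_index = ch_size * batch_size;

  for (unsigned src_index = 0; src_index < max_src_index; ++src_index) {
    // Same formula as the shader: the batch index scales by the full channel
    // interval; the channel offset shifts past the earlier inputs' channels.
    unsigned dst_index = (src_index / ch_size) * ch_interval
        + (src_index % ch_size) + ch_size_allprior;
    // Destination texel layer and component within the RGBA texel:
    printf("src %u -> dst %u (texel z=%u, component %u)\n",
           src_index, dst_index, dst_index / 4, dst_index % 4);
  }
  return 0;
}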
27 changes: 7 additions & 20 deletions aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl
@@ -20,10 +20,9 @@ uBuffer;
  * Params Buffer
  */
 layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // Extents of the output texture
+  // xyz contain the extents of the input texture, w contains HxW to help
+  // calculate buffer offsets
   ivec4 in_extents;
-  // Number of texels spanned by one channel
-  ivec2 c_info;
 }
 uBlock;
 
@@ -41,25 +40,13 @@ void main() {
 
   const vec4 intex = texelFetch(uImage, pos, 0);
 
-  const int n_index = int(pos.z / uBlock.c_info.x);
-  const int c_index = (pos.z % uBlock.c_info.x) * 4;
-  int d_offset = (n_index * uBlock.c_info.y) + c_index;
-
   const int base_index =
-      pos.x + uBlock.in_extents.x * pos.y + uBlock.in_extents.w * d_offset;
+      pos.x + uBlock.in_extents.x * pos.y + (4 * uBlock.in_extents.w) * pos.z;
   const ivec4 buf_indices =
       base_index + ivec4(0, 1, 2, 3) * uBlock.in_extents.w;
 
-  if (c_index < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.x] = intex.x;
-  }
-  if (c_index + 1 < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.y] = intex.y;
-  }
-  if (c_index + 2 < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.z] = intex.z;
-  }
-  if (c_index + 3 < uBlock.c_info.y) {
-    uBuffer.data[buf_indices.w] = intex.w;
-  }
+  uBuffer.data[buf_indices.x] = intex.x;
+  uBuffer.data[buf_indices.y] = intex.y;
+  uBuffer.data[buf_indices.z] = intex.z;
+  uBuffer.data[buf_indices.w] = intex.w;
 }
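[Editor's note] A quick C++ sketch of the restored buffer-offset math, under hypothetical 3x2 texture extents with in_extents.w = H*W: each texel at (x, y, z) packs four consecutive channels, whose elements sit H*W entries apart in the NCHW staging buffer.

#include <cstdio>

int main() {
  // Hypothetical input texture extents: W=3, H=2, so in_extents.w = H*W = 6.
  const int W = 3, H = 2, HW = W * H;

  // One texel position, e.g. (x=1, y=0, z=2).
  const int x = 1, y = 0, z = 2;

  // Restored formula: each texel layer z covers 4 channels, so its first
  // element starts 4*HW entries into the buffer per layer.
  const int base_index = x + W * y + (4 * HW) * z;

  // The texel's r/g/b/a components each land one channel plane (HW) apart.
  for (int i = 0; i < 4; ++i) {
    printf("component %d -> buffer index %d\n", i, base_index + i * HW);
  }
  return 0;
}

The unconditional stores suggest the staging buffer is allocated for the aligned combined depth, but that allocation lives outside this diff.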
59 changes: 18 additions & 41 deletions aten/src/ATen/native/vulkan/glsl/mean.glsl
@@ -1,77 +1,54 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/*
- * Output Image
- */
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
-
-/*
- * Input Textures
- */
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // extents of the output texture
-  // w contains pre-computed H*W of the input texture for convenience
-  ivec4 out_extents;
-  // extents of the input texture
-  // w contains size of input channels aligned to 4
-  ivec4 in_extents;
-}
-uBlock;
-
-/*
- * Shared memory buffer
- */
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 size;
+  ivec3 isize;
+} uBlock;
+
 shared vec4 sh_mem[64];
 
-/*
- * Local Work Group
- */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-/*
- * Computes the mean of an input tensor along the width, height, and channel
- * axes.
- */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   const ivec3 tid = ivec3(gl_LocalInvocationID);
   const ivec3 group_size = ivec3(gl_WorkGroupSize);
 
-  if (pos.z < uBlock.in_extents.z) {
+  if (pos.z < uBlock.isize.z) {
     vec4 sum = vec4(0);
 
-    for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) {
-      for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) {
+    for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) {
+      for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) {
         sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
       }
     }
 
-    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] =
-        sum;
+    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum;
   }
   memoryBarrierShared();
   barrier();
 
-  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.out_extents.z) {
+  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.size.z) {
     return;
   }
 
   vec4 total = vec4(0);
   for (int y = 0; y < group_size.y; ++y) {
     for (int x = 0; x < group_size.x; ++x) {
-      total +=
-          sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
+      total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
     }
   }
 
-  imageStore(uOutput, pos, total / uBlock.out_extents.w);
+  imageStore(
+      uOutput,
+      pos,
+      total / uBlock.size.w);
 }
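[Editor's note] The two-phase shared-memory reduction itself is unchanged by the revert; only the uniform names differ. A rough single-threaded C++ emulation of what one work group computes, under a hypothetical 4x4 local group over an 8x8 input plane:

#include <cstdio>

int main() {
  // Hypothetical sizes: one 8x8 input plane, 4x4 local work group,
  // and size.w = H*W = 64 as the mean divisor.
  const int W = 8, H = 8, GX = 4, GY = 4;
  const float divisor = float(W * H);

  float sh_mem[GY][GX] = {};  // per-thread partial sums (one channel shown)

  // Phase 1: every thread strides over the plane and accumulates its share.
  for (int ty = 0; ty < GY; ++ty)
    for (int tx = 0; tx < GX; ++tx)
      for (int y = ty; y < H; y += GY)
        for (int x = tx; x < W; x += GX)
          sh_mem[ty][tx] += 1.0f;  // stand-in for texelFetch(uInput, ...)

  // Phase 2 (thread (0,0) only, after the barrier): total the partials.
  float total = 0.0f;
  for (int y = 0; y < GY; ++y)
    for (int x = 0; x < GX; ++x)
      total += sh_mem[y][x];

  printf("mean = %f\n", total / divisor);  // 1.0 for the all-ones plane
  return 0;
}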
93 changes: 38 additions & 55 deletions aten/src/ATen/native/vulkan/glsl/mean2d.glsl
@@ -1,90 +1,73 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/*
- * Output Image
- */
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
-
-/*
- * Input Textures
- */
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // extents of the output texture
-  // w contains pre-computed H*W of the input texture for convenience
-  ivec4 out_extents;
-  // extents of the input texture
-  // w contains size of input channels aligned to 4
-  ivec4 in_extents;
-}
-uBlock;
-
-/*
- * Shared memory buffer
- */
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 size;
+  ivec3 isize;
+} uBlock;
+
 shared vec4 sh_mem[64];
 
-/*
- * Local Work Group
- */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-/*
- * Computes the mean of an input tensor along the width and height axes.
- */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   const ivec3 tid = ivec3(gl_LocalInvocationID);
   const ivec3 group_size = ivec3(gl_WorkGroupSize);
 
-  if (pos.z < uBlock.in_extents.z) {
+  if (pos.z < uBlock.isize.z) {
     vec4 sum = vec4(0);
 
-    for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) {
-      for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) {
+    for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) {
+      for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) {
         sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
       }
     }
 
-    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] =
-        sum;
+    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum;
  }
   memoryBarrierShared();
   barrier();
 
-  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.in_extents.z) {
+  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.isize.z) {
     return;
   }
 
   vec4 total = vec4(0);
   for (int y = 0; y < group_size.y; ++y) {
     for (int x = 0; x < group_size.x; ++x) {
-      total +=
-          sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
+      total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
     }
   }
 
-  const vec4 outtex = total / uBlock.out_extents.w;
-
-  const int nc_idx = pos.z * 4;
-  const int out_width = uBlock.out_extents.x;
-  const int out_height = uBlock.out_extents.y;
-
-  for (int i = 0; i < 4; ++i) {
-    const int n_idx = (nc_idx + i) / uBlock.in_extents.w;
-    const int c_idx = (nc_idx + i) % uBlock.in_extents.w;
-
-    ivec3 pos = ivec3(c_idx, n_idx, 0);
-    if (c_idx < out_width && n_idx < out_height) {
-      imageStore(uOutput, pos, vec4(outtex[i], 0, 0, 0));
-    }
-  }
+  const vec4 outtex = total / uBlock.size.w;
+  const int zoutx = 4*pos.z;
+  const int width = uBlock.size.x;
+  const int maxlen = uBlock.size.x * uBlock.size.y;
+
+  const int zouty = min(zoutx + 1, maxlen);
+  ivec3 posy = ivec3((zouty)%width, (zouty)/width, 0);
+  vec4 outy = vec4(outtex.y, 0, 0, 0);
+  imageStore(uOutput, posy, outy);
+
+  const int zoutz = min(zoutx + 2, maxlen);
+  ivec3 posz = ivec3((zoutz)%width, (zoutz)/width, 0);
+  vec4 outz = vec4(outtex.z, 0, 0, 0);
+  imageStore(uOutput, posz, outz);
+
+  const int zoutw = min(zoutx + 3, maxlen);
+  ivec3 posw = ivec3((zoutw)%width, (zoutw)/width, 0);
+  vec4 outw = vec4(outtex.w, 0, 0, 0);
+  imageStore(uOutput, posw, outw);
+
+  ivec3 posx = ivec3(zoutx%width, zoutx/width, 0);
+  vec4 outx = vec4(outtex.x, 0, 0, 0);
+  imageStore(uOutput, posx, outx);
 }
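[Editor's note] The restored epilogue spreads one texel's four channel means across the 2-D output image. A small C++ sketch of that index math, under a hypothetical output of width 5 (C=5 channel means per row) and height 2 (N=2):

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical output: N=2 rows of C=5 channel means each.
  const int width = 5, height = 2;
  const int maxlen = width * height;

  // Each input texel layer z carries channels 4*z .. 4*z+3.
  for (int z = 0; z * 4 < maxlen; ++z) {
    const int zoutx = 4 * z;
    for (int i = 0; i < 4; ++i) {
      // Mirrors the shader's min() clamp so trailing components of the last
      // texel don't stray arbitrarily far past the image.
      const int zout = std::min(zoutx + i, maxlen);
      printf("z=%d comp %d -> out (x=%d, y=%d)\n",
             z, i, zout % width, zout / width);
    }
  }
  return 0;
}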
