forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Revert "[vulkan] Pad channels when using texture storage instead of "tight packing" (pytorch#95251)"
This reverts commit 0eeb046.
- Loading branch information
1 parent
23b9d4e
commit cd941e4
Showing 16 changed files with 247 additions and 418 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,74 +1,47 @@ | ||
#version 450 core | ||
#define PRECISION $precision | ||
#define FORMAT $format | ||
#define FORMAT $format | ||
|
||
layout(std430) buffer; | ||
|
||
/* | ||
* Output Image | ||
*/ | ||
layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput; | ||
/* Qualifiers: layout - storage - precision - memory */ | ||
|
||
/* | ||
* Input Textures | ||
*/ | ||
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; | ||
layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput; | ||
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; | ||
layout(set = 0, binding = 2) uniform PRECISION restrict Block { | ||
ivec4 size; // output texture size (x=width,y=height,z=depth,w=unused) | ||
ivec4 isize; // input texture size (x=width,y=height,z=depth,w=unused) | ||
uint batch_size; // input tensor's batch size | ||
uint ch_size; // input tensor's channel size | ||
uint ch_interval; // channel interval (total # of channels for all tensors) | ||
uint ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor | ||
} uBlock; | ||
|
||
/* | ||
* Params Buffer | ||
*/ | ||
layout(set = 0, binding = 2) uniform PRECISION restrict Block { | ||
// output texture size (x=width,y=height,z=depth,w=unused) | ||
ivec4 out_extents; | ||
// input texture size (x=width,y=height,z=depth,w=unused) | ||
ivec4 in_extents; | ||
// input tensor's batch size | ||
uint batch_size; | ||
// input tensor's channel size | ||
uint ch_size; | ||
// channel interval (total # of channels for all tensors) | ||
uint ch_interval; | ||
// # of channels for tensor 0 to i-1 at ith tensor | ||
uint ch_size_allprior; | ||
} | ||
uBlock; | ||
|
||
/* | ||
* Local Work Group | ||
*/ | ||
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; | ||
|
||
void main() { | ||
const ivec3 in_pos = ivec3(gl_GlobalInvocationID); | ||
const ivec3 posIn = ivec3(gl_GlobalInvocationID); | ||
const uint max_src_index = uBlock.ch_size * uBlock.batch_size; | ||
|
||
if (any(greaterThanEqual(in_pos, uBlock.in_extents.xyz))) { | ||
return; | ||
} | ||
|
||
// x and y don't change. only z and index matter | ||
ivec3 out_pos = in_pos; | ||
const vec4 in_tex = texelFetch(uInput, in_pos, 0); | ||
|
||
for (uint i = 0; i < 4; ++i) { | ||
uint src_index = in_pos.z * 4 + i; | ||
|
||
if (src_index >= max_src_index) { | ||
// out of range | ||
break; | ||
if (all(lessThan(posIn, uBlock.isize.xyz))) { | ||
ivec3 posOut = posIn; // x and y don't change. only z and index matter | ||
const vec4 inval = texelFetch(uInput, posIn, 0); | ||
|
||
for (uint i = 0; i < 4; ++i) | ||
{ | ||
uint src_index = posIn.z * 4 + i; | ||
if (src_index >= max_src_index) { | ||
// out of range | ||
break; | ||
} | ||
|
||
uint dst_index = uint(src_index / uBlock.ch_size) * uBlock.ch_interval + (src_index % uBlock.ch_size) + uBlock.ch_size_allprior; | ||
posOut.z = int(dst_index / 4); | ||
uint j = (dst_index % 4); | ||
|
||
vec4 outval = imageLoad(uOutput, posOut); | ||
outval[j] = inval[i]; | ||
imageStore(uOutput, posOut, outval); | ||
} | ||
|
||
uint src_n_idx = src_index / uBlock.ch_size; | ||
uint src_c_idx = src_index % uBlock.ch_size; | ||
|
||
uint dst_nc_idx = | ||
src_n_idx * uBlock.ch_interval + src_c_idx + uBlock.ch_size_allprior; | ||
|
||
out_pos.z = int(dst_nc_idx / 4); | ||
uint j = (dst_nc_idx % 4); | ||
|
||
vec4 out_tex = imageLoad(uOutput, out_pos); | ||
out_tex[j] = in_tex[i]; | ||
imageStore(uOutput, out_pos, out_tex); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,77 +1,54 @@ | ||
#version 450 core | ||
#define PRECISION $precision | ||
#define FORMAT $format | ||
#define FORMAT $format | ||
|
||
layout(std430) buffer; | ||
|
||
/* | ||
* Output Image | ||
*/ | ||
layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; | ||
/* Qualifiers: layout - storage - precision - memory */ | ||
|
||
/* | ||
* Input Textures | ||
*/ | ||
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; | ||
layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; | ||
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; | ||
layout(set = 0, binding = 2) uniform PRECISION restrict Block { | ||
ivec4 size; | ||
ivec3 isize; | ||
} uBlock; | ||
|
||
/* | ||
* Params Buffer | ||
*/ | ||
layout(set = 0, binding = 2) uniform PRECISION restrict Block { | ||
// extents of the output texture | ||
// w contains pre-computed H*W of the input texture for convenience | ||
ivec4 out_extents; | ||
// extents of the input texture | ||
// w contains size of input channels aligned to 4 | ||
ivec4 in_extents; | ||
} | ||
uBlock; | ||
|
||
/* | ||
* Shared memory buffer | ||
*/ | ||
shared vec4 sh_mem[64]; | ||
|
||
/* | ||
* Local Work Group | ||
*/ | ||
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; | ||
|
||
/* | ||
* Computes the mean of an input tensor along the width, height, and channel | ||
* axes. | ||
*/ | ||
void main() { | ||
const ivec3 pos = ivec3(gl_GlobalInvocationID); | ||
const ivec3 tid = ivec3(gl_LocalInvocationID); | ||
const ivec3 group_size = ivec3(gl_WorkGroupSize); | ||
|
||
if (pos.z < uBlock.in_extents.z) { | ||
if (pos.z < uBlock.isize.z) { | ||
vec4 sum = vec4(0); | ||
|
||
for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) { | ||
for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) { | ||
for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) { | ||
for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) { | ||
sum += texelFetch(uInput, ivec3(x, y, pos.z), 0); | ||
} | ||
} | ||
|
||
sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = | ||
sum; | ||
sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum; | ||
} | ||
memoryBarrierShared(); | ||
barrier(); | ||
|
||
if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.out_extents.z) { | ||
if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.size.z) { | ||
return; | ||
} | ||
|
||
vec4 total = vec4(0); | ||
for (int y = 0; y < group_size.y; ++y) { | ||
for (int x = 0; x < group_size.x; ++x) { | ||
total += | ||
sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x]; | ||
total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x]; | ||
} | ||
} | ||
|
||
imageStore(uOutput, pos, total / uBlock.out_extents.w); | ||
imageStore( | ||
uOutput, | ||
pos, | ||
total / uBlock.size.w); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,90 +1,73 @@ | ||
#version 450 core | ||
#define PRECISION $precision | ||
#define FORMAT $format | ||
#define FORMAT $format | ||
|
||
layout(std430) buffer; | ||
|
||
/* | ||
* Output Image | ||
*/ | ||
layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; | ||
|
||
/* | ||
* Input Textures | ||
*/ | ||
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; | ||
|
||
/* | ||
* Params Buffer | ||
*/ | ||
layout(set = 0, binding = 2) uniform PRECISION restrict Block { | ||
// extents of the output texture | ||
// w contains pre-computed H*W of the input texture for convenience | ||
ivec4 out_extents; | ||
// extents of the input texture | ||
// w contains size of input channels aligned to 4 | ||
ivec4 in_extents; | ||
} | ||
uBlock; | ||
/* Qualifiers: layout - storage - precision - memory */ | ||
|
||
layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; | ||
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; | ||
layout(set = 0, binding = 2) uniform PRECISION restrict Block { | ||
ivec4 size; | ||
ivec3 isize; | ||
} uBlock; | ||
|
||
/* | ||
* Shared memory buffer | ||
*/ | ||
shared vec4 sh_mem[64]; | ||
|
||
/* | ||
* Local Work Group | ||
*/ | ||
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; | ||
|
||
/* | ||
* Computes the mean of an input tensor along the width and height axes. | ||
*/ | ||
void main() { | ||
const ivec3 pos = ivec3(gl_GlobalInvocationID); | ||
const ivec3 tid = ivec3(gl_LocalInvocationID); | ||
const ivec3 group_size = ivec3(gl_WorkGroupSize); | ||
|
||
if (pos.z < uBlock.in_extents.z) { | ||
if (pos.z < uBlock.isize.z) { | ||
vec4 sum = vec4(0); | ||
|
||
for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) { | ||
for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) { | ||
for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) { | ||
for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) { | ||
sum += texelFetch(uInput, ivec3(x, y, pos.z), 0); | ||
} | ||
} | ||
|
||
sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = | ||
sum; | ||
sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum; | ||
} | ||
memoryBarrierShared(); | ||
barrier(); | ||
|
||
if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.in_extents.z) { | ||
if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.isize.z) { | ||
return; | ||
} | ||
|
||
vec4 total = vec4(0); | ||
for (int y = 0; y < group_size.y; ++y) { | ||
for (int x = 0; x < group_size.x; ++x) { | ||
total += | ||
sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x]; | ||
total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x]; | ||
} | ||
} | ||
|
||
const vec4 outtex = total / uBlock.out_extents.w; | ||
|
||
const int nc_idx = pos.z * 4; | ||
const int out_width = uBlock.out_extents.x; | ||
const int out_height = uBlock.out_extents.y; | ||
|
||
for (int i = 0; i < 4; ++i) { | ||
const int n_idx = (nc_idx + i) / uBlock.in_extents.w; | ||
const int c_idx = (nc_idx + i) % uBlock.in_extents.w; | ||
|
||
ivec3 pos = ivec3(c_idx, n_idx, 0); | ||
if (c_idx < out_width && n_idx < out_height) { | ||
imageStore(uOutput, pos, vec4(outtex[i], 0, 0, 0)); | ||
} | ||
} | ||
const vec4 outtex = total / uBlock.size.w; | ||
const int zoutx = 4*pos.z; | ||
const int width = uBlock.size.x; | ||
const int maxlen = uBlock.size.x * uBlock.size.y; | ||
|
||
const int zouty = min(zoutx + 1, maxlen); | ||
ivec3 posy = ivec3((zouty)%width, (zouty)/width, 0); | ||
vec4 outy = vec4(outtex.y, 0, 0, 0); | ||
imageStore(uOutput, posy, outy); | ||
|
||
const int zoutz = min(zoutx + 2, maxlen); | ||
ivec3 posz = ivec3((zoutz)%width, (zoutz)/width, 0); | ||
vec4 outz = vec4(outtex.z, 0, 0, 0); | ||
imageStore(uOutput, posz, outz); | ||
|
||
const int zoutw = min(zoutx + 3, maxlen); | ||
ivec3 posw = ivec3((zoutw)%width, (zoutw)/width, 0); | ||
vec4 outw = vec4(outtex.w, 0, 0, 0); | ||
imageStore(uOutput, posw, outw); | ||
|
||
ivec3 posx = ivec3(zoutx%width, zoutx/width, 0); | ||
vec4 outx = vec4(outtex.x, 0, 0, 0); | ||
imageStore(uOutput, posx, outx); | ||
} |
Oops, something went wrong.