hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

jglaser · 2019-11-18T05:16:03Z

With the HCC backend, using hipFuncGetAttributes on a __global__ function does not always return an accurate upper bound of the maximum block size. Neither does it return a multiple of the warp size. As a consequence, when iterating over possible kernel parameters allowed by attr.maxThreadsPerBlock, I get an error:

### HCC STATUS_CHECK Error: HSA_STATUS_ERROR_INVALID_ISA (0x100f) at file:mcwamp_hsa.cpp line:1194

I believe this is because because the code in hip_module.cpp hardcodes a fixed number of registers per CU (65536), and does not distinguish between SGPR and VGPR usage of a kernel function. Instead, it should do something more sensible like in hipOccupancyMaxPotentialBlockSize.

The text was updated successfully, but these errors were encountered:

jglaser · 2019-11-18T20:58:51Z

here's a reproducer:

#include <hip/hip_runtime.h>

// data types

//! Storage for group members (GPU declaration)
template<unsigned int group_size>
union group_storage
    {
    unsigned int tag[group_size]; // access 'tags'
    unsigned int idx[group_size]; // access 'indices'
    };

typedef float Scalar;
typedef float3 Scalar3;
typedef float4 Scalar4;

#define make_scalar3 make_float3
#define make_scalar4 make_float4

//! BoxDim
#ifdef __HIPCC__
#define HOSTDEVICE __host__ __device__ inline
#else
#define HOSTDEVICE inline __attribute__((always_inline))
#endif

struct BoxDim
    {
    public:
        HOSTDEVICE explicit BoxDim(Scalar Len_x, Scalar Len_y, Scalar Len_z)
            {
            setL(make_scalar3(Len_x, Len_y, Len_z));
            m_periodic = make_uchar3(1,1,1);
            m_xz = m_xy = m_yz = Scalar(0.0);
            }

        HOSTDEVICE void setL(const Scalar3& L)
            {
            m_hi = L/Scalar(2.0);
            m_lo = -m_hi;
            m_Linv = Scalar(1.0)/L;
            m_L = L;
            }


        HOSTDEVICE Scalar3 getL() const
            {
            return m_L;
            }

        HOSTDEVICE Scalar3 minImage(const Scalar3& v) const
            {
            Scalar3 w = v;
            Scalar3 L = getL();

            #ifdef __HIPCC__
            if (m_periodic.z)
                {
                Scalar img = rintf(w.z * m_Linv.z);
                w.z -= L.z * img;
                w.y -= L.z * m_yz * img;
                w.x -= L.z * m_xz * img;
                }

            if (m_periodic.y)
                {
                Scalar img = rintf(w.y * m_Linv.y);
                w.y -= L.y * img;
                w.x -= L.y * m_xy * img;
               }

            if (m_periodic.x)
                {
                w.x -= L.x * rintf(w.x * m_Linv.x);
                }
            #else
            // on the cpu, branches are faster than calling rintf
            if (m_periodic.z)
                {
                if (w.z >= m_hi.z)
                    {
                    w.z -= L.z;
                    w.y -= L.z * m_yz;
                    w.x -= L.z * m_xz;
                    }
                else if (w.z < m_lo.z)
                    {
                    w.z += L.z;
                    w.y += L.z * m_yz;
                    w.x += L.z * m_xz;
                    }
                }

            if (m_periodic.y)
                {
                if (w.y >= m_hi.y)
                    {
                    int i = (w.y*m_Linv.y+Scalar(0.5));
                    w.y -= (Scalar)i*L.y;
                    w.x -= (Scalar)i*L.y * m_xy;
                    }
                else if (w.y < m_lo.y)
                    {
                    int i = (-w.y*m_Linv.y+Scalar(0.5));
                    w.y += (Scalar)i*L.y;
                    w.x += (Scalar)i*L.y * m_xy;
                    }
                }

            if (m_periodic.x)
                {
                if (w.x >= m_hi.x)
                    {
                    int i = (w.x*m_Linv.x+Scalar(0.5));
                    w.x -= (Scalar)i*L.x;
                    }
                else if (w.x < m_lo.x)
                    {
                    int i = (-w.x*m_Linv.x+Scalar(0.5));
                    w.x += (Scalar)i*L.x;
                    }
                }
            #endif

            return w;
            }
    private:
        Scalar3 m_lo;      //!< Minimum coords in the box
        Scalar3 m_hi;      //!< Maximum coords in the box
        Scalar3 m_L;       //!< L precomputed (used to avoid subtractions in boundary conditions)
        Scalar3 m_Linv;    //!< 1/L precomputed (used to avoid divisions in boundary conditions)
        Scalar m_xy;       //!< xy tilt factor
        Scalar m_xz;       //!< xz tilt factor
        Scalar m_yz;       //!< yz tilt factor
        uchar3 m_periodic; //!< 0/1 in each direction to tell if the box is periodic in that direction

    };

//! Kernel
__global__
void gpu_compute_opls_dihedral_forces_kernel(Scalar4* d_force,
                                                 Scalar* d_virial,
                                                 const unsigned int virial_pitch,
                                                 const unsigned int N,
                                                 const Scalar4 *d_pos,
                                                 const Scalar4 *d_params,
                                                 BoxDim box,
                                                 const group_storage<4> *tlist,
                                                 const unsigned int *dihedral_ABCD,
                                                 const unsigned int pitch,
                                                 const unsigned int *n_dihedrals_list)
    {
    // start by identifying which particle we are to handle
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= N)
        return;

    // load in the length of the list for this thread (MEM TRANSFER: 4 bytes)
    int n_dihedrals = n_dihedrals_list[idx];

    // read in the position of our b-particle from the a-b-c-d set. (MEM TRANSFER: 16 bytes)
    Scalar4 idx_postype = d_pos[idx];  // we can be either a, b, or c in the a-b-c-d quartet
    Scalar3 idx_pos = make_scalar3(idx_postype.x, idx_postype.y, idx_postype.z);
    Scalar3 pos_a,pos_b,pos_c, pos_d; // allocate space for the a,b, and c atoms in the a-b-c-d quartet

    // initialize the force to 0
    Scalar4 force_idx = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));

    // initialize the virial to 0
    Scalar virial_idx[6];
    for (unsigned int i = 0; i < 6; i++)
        virial_idx[i] = Scalar(0.0);

    // loop over all dihedrals
    for (int dihedral_idx = 0; dihedral_idx < n_dihedrals; dihedral_idx++)
        {
        group_storage<4> cur_dihedral = tlist[pitch*dihedral_idx + idx];
        unsigned int cur_ABCD = dihedral_ABCD[pitch*dihedral_idx + idx];

        int cur_dihedral_x_idx = cur_dihedral.idx[0];
        int cur_dihedral_y_idx = cur_dihedral.idx[1];
        int cur_dihedral_z_idx = cur_dihedral.idx[2];
        int cur_dihedral_type = cur_dihedral.idx[3];
        int cur_dihedral_abcd = cur_ABCD;

        // get the a-particle's position (MEM TRANSFER: 16 bytes)
        Scalar4 x_postype = d_pos[cur_dihedral_x_idx];
        Scalar3 x_pos = make_scalar3(x_postype.x, x_postype.y, x_postype.z);
        // get the c-particle's position (MEM TRANSFER: 16 bytes)
        Scalar4 y_postype = d_pos[cur_dihedral_y_idx];
        Scalar3 y_pos = make_scalar3(y_postype.x, y_postype.y, y_postype.z);
        // get the c-particle's position (MEM TRANSFER: 16 bytes)
        Scalar4 z_postype = d_pos[cur_dihedral_z_idx];
        Scalar3 z_pos = make_scalar3(z_postype.x, z_postype.y, z_postype.z);

        if (cur_dihedral_abcd == 0)
            {
            pos_a = idx_pos;
            pos_b = x_pos;
            pos_c = y_pos;
            pos_d = z_pos;
           }
        if (cur_dihedral_abcd == 1)
            {
            pos_b = idx_pos;
            pos_a = x_pos;
            pos_c = y_pos;
            pos_d = z_pos;
            }
        if (cur_dihedral_abcd == 2)
            {
            pos_c = idx_pos;
            pos_a = x_pos;
            pos_b = y_pos;
            pos_d = z_pos;
            }
        if (cur_dihedral_abcd == 3)
            {
            pos_d = idx_pos;
            pos_a = x_pos;
            pos_b = y_pos;
            pos_c = z_pos;
            }

        // the three bonds

        Scalar3 vb1 = pos_a - pos_b;
        Scalar3 vb2 = pos_c - pos_b;
        Scalar3 vb3 = pos_d - pos_c;

        // apply periodic boundary conditions
        vb1 = box.minImage(vb1);
        vb2 = box.minImage(vb2);
        vb3 = box.minImage(vb3);

        Scalar3 vb2m = -vb2;
        vb2m = box.minImage(vb2m);

        // c,s calculation

        Scalar ax, ay, az, bx, by, bz;
        ax = vb1.y*vb2m.z - vb1.z*vb2m.y;
        ay = vb1.z*vb2m.x - vb1.x*vb2m.z;
        az = vb1.x*vb2m.y - vb1.y*vb2m.x;
        bx = vb3.y*vb2m.z - vb3.z*vb2m.y;
        by = vb3.z*vb2m.x - vb3.x*vb2m.z;
        bz = vb3.x*vb2m.y - vb3.y*vb2m.x;

        Scalar rasq = ax*ax + ay*ay + az*az;
        Scalar rbsq = bx*bx + by*by + bz*bz;
        Scalar rgsq = vb2m.x*vb2m.x + vb2m.y*vb2m.y + vb2m.z*vb2m.z;
        Scalar rg = ::sqrtf(rgsq);

        Scalar rginv, ra2inv, rb2inv;
        rginv = ra2inv = rb2inv = 0.0;
        if (rg > 0) rginv = 1.0/rg;
        if (rasq > 0) ra2inv = 1.0/rasq;
        if (rbsq > 0) rb2inv = 1.0/rbsq;
        Scalar rabinv = ::sqrtf(ra2inv*rb2inv);

        Scalar c = (ax*bx + ay*by + az*bz)*rabinv;
        Scalar s = rg*rabinv*(ax*vb3.x + ay*vb3.y + az*vb3.z);

        if (c > 1.0) c = 1.0;
        if (c < -1.0) c = -1.0;

        // get values for k1/2 through k4/2 (MEM TRANSFER: 16 bytes)
        // ----- The 1/2 factor is already stored in the parameters --------
        Scalar4 params = __ldg(d_params + cur_dihedral_type);
        Scalar k1 = params.x;
        Scalar k2 = params.y;
        Scalar k3 = params.z;
        Scalar k4 = params.w;

        // calculate the potential p = sum (i=1,4) k_i * (1 + (-1)**(i+1)*cos(i*phi) )
        // and df = dp/dc

        // cos(phi) term
        Scalar ddf1 = c;
        Scalar df1 = s;
        Scalar cos_term = ddf1;

        Scalar p = k1 * (1.0 + cos_term);
        Scalar df = k1*df1;

        // cos(2*phi) term
        ddf1 = cos_term*c - df1*s;
        df1 = cos_term*s + df1*c;
        cos_term = ddf1;

        p += k2 * (1.0 - cos_term);
        df += -2.0*k2*df1;

        // cos(3*phi) term
        ddf1 = cos_term*c - df1*s;
        df1 = cos_term*s + df1*c;
        cos_term = ddf1;

        p += k3 * (1.0 + cos_term);
        df += 3.0*k3*df1;

        // cos(4*phi) term
        ddf1 = cos_term*c - df1*s;
        df1 = cos_term*s + df1*c;
        cos_term = ddf1;

        p += k4 * (1.0 - cos_term);
        df += -4.0*k4*df1;

        // Compute 1/4 of energy to assign to each of 4 atoms in the dihedral
        Scalar e_dihedral = 0.25*p;

        Scalar fg = vb1.x*vb2m.x + vb1.y*vb2m.y + vb1.z*vb2m.z;
        Scalar hg = vb3.x*vb2m.x + vb3.y*vb2m.y + vb3.z*vb2m.z;
        Scalar fga = fg*ra2inv*rginv;
        Scalar hgb = hg*rb2inv*rginv;
        Scalar gaa = -ra2inv*rg;
        Scalar gbb = rb2inv*rg;

        Scalar dtfx = gaa*ax;
        Scalar dtfy = gaa*ay;
        Scalar dtfz = gaa*az;
        Scalar dtgx = fga*ax - hgb*bx;
        Scalar dtgy = fga*ay - hgb*by;
        Scalar dtgz = fga*az - hgb*bz;
        Scalar dthx = gbb*bx;
        Scalar dthy = gbb*by;
        Scalar dthz = gbb*bz;

        Scalar sx2 = df*dtgx;
        Scalar sy2 = df*dtgy;
        Scalar sz2 = df*dtgz;

        Scalar3 f1, f2, f3, f4;
        f1.x = df*dtfx;
        f1.y = df*dtfy;
        f1.z = df*dtfz;

        f2.x = sx2 - f1.x;
        f2.y = sy2 - f1.y;
        f2.z = sz2 - f1.z;

        f4.x = df*dthx;
        f4.y = df*dthy;
        f4.z = df*dthz;

        f3.x = -sx2 - f4.x;
        f3.y = -sy2 - f4.y;
        f3.z = -sz2 - f4.z;

        // Compute 1/4 of the virial, 1/4 for each atom in the dihedral
        // upper triangular version of virial tensor

        Scalar dihedral_virial[6];
        dihedral_virial[0] = 0.25*(vb1.x*f1.x + vb2.x*f3.x + (vb3.x+vb2.x)*f4.x);
        dihedral_virial[1] = 0.25*(vb1.y*f1.x + vb2.y*f3.x + (vb3.y+vb2.y)*f4.x);
        dihedral_virial[2] = 0.25*(vb1.z*f1.x + vb2.z*f3.x + (vb3.z+vb2.z)*f4.x);
        dihedral_virial[3] = 0.25*(vb1.y*f1.y + vb2.y*f3.y + (vb3.y+vb2.y)*f4.y);
        dihedral_virial[4] = 0.25*(vb1.z*f1.y + vb2.z*f3.y + (vb3.z+vb2.z)*f4.y);
        dihedral_virial[5] = 0.25*(vb1.z*f1.z + vb2.z*f3.z + (vb3.z+vb2.z)*f4.z);

        if (cur_dihedral_abcd == 0)
            {
            force_idx.x += f1.x;
            force_idx.y += f1.y;
            force_idx.z += f1.z;
            }
        if (cur_dihedral_abcd == 1)
            {
            force_idx.x += f2.x;
            force_idx.y += f2.y;
            force_idx.z += f2.z;
            }
        if (cur_dihedral_abcd == 2)
            {
            force_idx.x += f3.x;
            force_idx.y += f3.y;
            force_idx.z += f3.z;
            }
        if (cur_dihedral_abcd == 3)
            {
            force_idx.x += f4.x;
            force_idx.y += f4.y;
            force_idx.z += f4.z;
            }
        force_idx.w += e_dihedral;

        for (int k = 0; k < 6; k++)
            virial_idx[k] += dihedral_virial[k];
        }

    // now that the force calculation is complete, write out the result (MEM TRANSFER: 20 bytes)
    d_force[idx] = force_idx;
    for (int k = 0; k < 6; k++)
       d_virial[k*virial_pitch+idx] = virial_idx[k];
    }

int main(int argc, char **argv)
    {
    // dummy arguments
    Scalar4 *d_force = 0;
    Scalar *d_virial = 0;
    unsigned int virial_pitch = 0;
   unsigned int N = 1234;
    const Scalar4 *d_pos = 0;
    const Scalar4 *d_params = 0;
    const BoxDim box(1.0,1.0,1.0);
    const group_storage<4> *tlist = 0;
    const unsigned int *dihedral_ABCD = 0;
    unsigned int pitch =0 ;
    const unsigned int *n_dihedrals_list = 0;

    static unsigned int max_block_size = UINT_MAX;
    if (max_block_size == UINT_MAX)
        {
        hipFuncAttributes attr;
        hipFuncGetAttributes(&attr, (const void *)gpu_compute_opls_dihedral_forces_kernel);
        max_block_size = attr.maxThreadsPerBlock;
        }

    unsigned int run_block_size = max_block_size;

    // setup the grid to run the kernel
    dim3 grid( N / run_block_size + 1, 1, 1);
    dim3 threads(run_block_size, 1, 1);

    // run the kernel

   hipLaunchKernelGGL((gpu_compute_opls_dihedral_forces_kernel), dim3(grid), dim3(threads), 0, 0, d_force, d_virial, virial_pitch, N, d_pos, d_params,
                                                                box, tlist, dihedral_ABCD, pitch, n_dihedrals_list);

   return 0;
   }

returns

### HCC STATUS_CHECK Error: HSA_STATUS_ERROR_INVALID_ISA (0x100f) at file:mcwamp_hsa.cpp line:1194
Aborted

on gfx-900 with hcc (HIP on ubuntu 16.04, hip_base version 2.8.19361.4078-rocm-rel-2.9-6-cbe6b65)

This PR takes ensures that the maxThreadsPerBlock returned by hipFuncGetAttributes is both a multiple of the warp size and that the register usage of the maximum block does not exceed the number of available registers. Fixes #1662

jglaser mentioned this issue Nov 20, 2019

Implement accurate max block size in hipFuncGetAttributes() #1676

Merged

mangupta closed this as completed in #1676 Mar 18, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

jglaser commented Nov 18, 2019

jglaser commented Nov 18, 2019

hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

Comments

jglaser commented Nov 18, 2019

jglaser commented Nov 18, 2019