Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

Closed
jglaser opened this issue Nov 18, 2019 · 1 comment · Fixed by #1676
Closed

hipFuncGetAttributes returns inaccurate maxThreadsPerBlock #1662

jglaser opened this issue Nov 18, 2019 · 1 comment · Fixed by #1676

Comments

@jglaser
Copy link

jglaser commented Nov 18, 2019

With the HCC backend, using hipFuncGetAttributes on a __global__ function does not always return an accurate upper bound of the maximum block size. Neither does it return a multiple of the warp size. As a consequence, when iterating over possible kernel parameters allowed by attr.maxThreadsPerBlock, I get an error:

### HCC STATUS_CHECK Error: HSA_STATUS_ERROR_INVALID_ISA (0x100f) at file:mcwamp_hsa.cpp line:1194

I believe this is because because the code in hip_module.cpp hardcodes a fixed number of registers per CU (65536), and does not distinguish between SGPR and VGPR usage of a kernel function. Instead, it should do something more sensible like in hipOccupancyMaxPotentialBlockSize.

@jglaser
Copy link
Author

jglaser commented Nov 18, 2019

here's a reproducer:

#include <hip/hip_runtime.h>

// data types

//! Storage for group members (GPU declaration)
template<unsigned int group_size>
union group_storage
    {
    unsigned int tag[group_size]; // access 'tags'
    unsigned int idx[group_size]; // access 'indices'
    };

typedef float Scalar;
typedef float3 Scalar3;
typedef float4 Scalar4;

#define make_scalar3 make_float3
#define make_scalar4 make_float4

//! BoxDim
#ifdef __HIPCC__
#define HOSTDEVICE __host__ __device__ inline
#else
#define HOSTDEVICE inline __attribute__((always_inline))
#endif

struct BoxDim
    {
    public:
        HOSTDEVICE explicit BoxDim(Scalar Len_x, Scalar Len_y, Scalar Len_z)
            {
            setL(make_scalar3(Len_x, Len_y, Len_z));
            m_periodic = make_uchar3(1,1,1);
            m_xz = m_xy = m_yz = Scalar(0.0);
            }

        HOSTDEVICE void setL(const Scalar3& L)
            {
            m_hi = L/Scalar(2.0);
            m_lo = -m_hi;
            m_Linv = Scalar(1.0)/L;
            m_L = L;
            }


        HOSTDEVICE Scalar3 getL() const
            {
            return m_L;
            }

        HOSTDEVICE Scalar3 minImage(const Scalar3& v) const
            {
            Scalar3 w = v;
            Scalar3 L = getL();

            #ifdef __HIPCC__
            if (m_periodic.z)
                {
                Scalar img = rintf(w.z * m_Linv.z);
                w.z -= L.z * img;
                w.y -= L.z * m_yz * img;
                w.x -= L.z * m_xz * img;
                }

            if (m_periodic.y)
                {
                Scalar img = rintf(w.y * m_Linv.y);
                w.y -= L.y * img;
                w.x -= L.y * m_xy * img;
               }

            if (m_periodic.x)
                {
                w.x -= L.x * rintf(w.x * m_Linv.x);
                }
            #else
            // on the cpu, branches are faster than calling rintf
            if (m_periodic.z)
                {
                if (w.z >= m_hi.z)
                    {
                    w.z -= L.z;
                    w.y -= L.z * m_yz;
                    w.x -= L.z * m_xz;
                    }
                else if (w.z < m_lo.z)
                    {
                    w.z += L.z;
                    w.y += L.z * m_yz;
                    w.x += L.z * m_xz;
                    }
                }

            if (m_periodic.y)
                {
                if (w.y >= m_hi.y)
                    {
                    int i = (w.y*m_Linv.y+Scalar(0.5));
                    w.y -= (Scalar)i*L.y;
                    w.x -= (Scalar)i*L.y * m_xy;
                    }
                else if (w.y < m_lo.y)
                    {
                    int i = (-w.y*m_Linv.y+Scalar(0.5));
                    w.y += (Scalar)i*L.y;
                    w.x += (Scalar)i*L.y * m_xy;
                    }
                }

            if (m_periodic.x)
                {
                if (w.x >= m_hi.x)
                    {
                    int i = (w.x*m_Linv.x+Scalar(0.5));
                    w.x -= (Scalar)i*L.x;
                    }
                else if (w.x < m_lo.x)
                    {
                    int i = (-w.x*m_Linv.x+Scalar(0.5));
                    w.x += (Scalar)i*L.x;
                    }
                }
            #endif

            return w;
            }
    private:
        Scalar3 m_lo;      //!< Minimum coords in the box
        Scalar3 m_hi;      //!< Maximum coords in the box
        Scalar3 m_L;       //!< L precomputed (used to avoid subtractions in boundary conditions)
        Scalar3 m_Linv;    //!< 1/L precomputed (used to avoid divisions in boundary conditions)
        Scalar m_xy;       //!< xy tilt factor
        Scalar m_xz;       //!< xz tilt factor
        Scalar m_yz;       //!< yz tilt factor
        uchar3 m_periodic; //!< 0/1 in each direction to tell if the box is periodic in that direction

    };

//! Kernel
__global__
void gpu_compute_opls_dihedral_forces_kernel(Scalar4* d_force,
                                                 Scalar* d_virial,
                                                 const unsigned int virial_pitch,
                                                 const unsigned int N,
                                                 const Scalar4 *d_pos,
                                                 const Scalar4 *d_params,
                                                 BoxDim box,
                                                 const group_storage<4> *tlist,
                                                 const unsigned int *dihedral_ABCD,
                                                 const unsigned int pitch,
                                                 const unsigned int *n_dihedrals_list)
    {
    // start by identifying which particle we are to handle
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= N)
        return;

    // load in the length of the list for this thread (MEM TRANSFER: 4 bytes)
    int n_dihedrals = n_dihedrals_list[idx];

    // read in the position of our b-particle from the a-b-c-d set. (MEM TRANSFER: 16 bytes)
    Scalar4 idx_postype = d_pos[idx];  // we can be either a, b, or c in the a-b-c-d quartet
    Scalar3 idx_pos = make_scalar3(idx_postype.x, idx_postype.y, idx_postype.z);
    Scalar3 pos_a,pos_b,pos_c, pos_d; // allocate space for the a,b, and c atoms in the a-b-c-d quartet

    // initialize the force to 0
    Scalar4 force_idx = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));

    // initialize the virial to 0
    Scalar virial_idx[6];
    for (unsigned int i = 0; i < 6; i++)
        virial_idx[i] = Scalar(0.0);

    // loop over all dihedrals
    for (int dihedral_idx = 0; dihedral_idx < n_dihedrals; dihedral_idx++)
        {
        group_storage<4> cur_dihedral = tlist[pitch*dihedral_idx + idx];
        unsigned int cur_ABCD = dihedral_ABCD[pitch*dihedral_idx + idx];

        int cur_dihedral_x_idx = cur_dihedral.idx[0];
        int cur_dihedral_y_idx = cur_dihedral.idx[1];
        int cur_dihedral_z_idx = cur_dihedral.idx[2];
        int cur_dihedral_type = cur_dihedral.idx[3];
        int cur_dihedral_abcd = cur_ABCD;

        // get the a-particle's position (MEM TRANSFER: 16 bytes)
        Scalar4 x_postype = d_pos[cur_dihedral_x_idx];
        Scalar3 x_pos = make_scalar3(x_postype.x, x_postype.y, x_postype.z);
        // get the c-particle's position (MEM TRANSFER: 16 bytes)
        Scalar4 y_postype = d_pos[cur_dihedral_y_idx];
        Scalar3 y_pos = make_scalar3(y_postype.x, y_postype.y, y_postype.z);
        // get the c-particle's position (MEM TRANSFER: 16 bytes)
        Scalar4 z_postype = d_pos[cur_dihedral_z_idx];
        Scalar3 z_pos = make_scalar3(z_postype.x, z_postype.y, z_postype.z);

        if (cur_dihedral_abcd == 0)
            {
            pos_a = idx_pos;
            pos_b = x_pos;
            pos_c = y_pos;
            pos_d = z_pos;
           }
        if (cur_dihedral_abcd == 1)
            {
            pos_b = idx_pos;
            pos_a = x_pos;
            pos_c = y_pos;
            pos_d = z_pos;
            }
        if (cur_dihedral_abcd == 2)
            {
            pos_c = idx_pos;
            pos_a = x_pos;
            pos_b = y_pos;
            pos_d = z_pos;
            }
        if (cur_dihedral_abcd == 3)
            {
            pos_d = idx_pos;
            pos_a = x_pos;
            pos_b = y_pos;
            pos_c = z_pos;
            }

        // the three bonds

        Scalar3 vb1 = pos_a - pos_b;
        Scalar3 vb2 = pos_c - pos_b;
        Scalar3 vb3 = pos_d - pos_c;

        // apply periodic boundary conditions
        vb1 = box.minImage(vb1);
        vb2 = box.minImage(vb2);
        vb3 = box.minImage(vb3);

        Scalar3 vb2m = -vb2;
        vb2m = box.minImage(vb2m);

        // c,s calculation

        Scalar ax, ay, az, bx, by, bz;
        ax = vb1.y*vb2m.z - vb1.z*vb2m.y;
        ay = vb1.z*vb2m.x - vb1.x*vb2m.z;
        az = vb1.x*vb2m.y - vb1.y*vb2m.x;
        bx = vb3.y*vb2m.z - vb3.z*vb2m.y;
        by = vb3.z*vb2m.x - vb3.x*vb2m.z;
        bz = vb3.x*vb2m.y - vb3.y*vb2m.x;

        Scalar rasq = ax*ax + ay*ay + az*az;
        Scalar rbsq = bx*bx + by*by + bz*bz;
        Scalar rgsq = vb2m.x*vb2m.x + vb2m.y*vb2m.y + vb2m.z*vb2m.z;
        Scalar rg = ::sqrtf(rgsq);

        Scalar rginv, ra2inv, rb2inv;
        rginv = ra2inv = rb2inv = 0.0;
        if (rg > 0) rginv = 1.0/rg;
        if (rasq > 0) ra2inv = 1.0/rasq;
        if (rbsq > 0) rb2inv = 1.0/rbsq;
        Scalar rabinv = ::sqrtf(ra2inv*rb2inv);

        Scalar c = (ax*bx + ay*by + az*bz)*rabinv;
        Scalar s = rg*rabinv*(ax*vb3.x + ay*vb3.y + az*vb3.z);

        if (c > 1.0) c = 1.0;
        if (c < -1.0) c = -1.0;

        // get values for k1/2 through k4/2 (MEM TRANSFER: 16 bytes)
        // ----- The 1/2 factor is already stored in the parameters --------
        Scalar4 params = __ldg(d_params + cur_dihedral_type);
        Scalar k1 = params.x;
        Scalar k2 = params.y;
        Scalar k3 = params.z;
        Scalar k4 = params.w;

        // calculate the potential p = sum (i=1,4) k_i * (1 + (-1)**(i+1)*cos(i*phi) )
        // and df = dp/dc

        // cos(phi) term
        Scalar ddf1 = c;
        Scalar df1 = s;
        Scalar cos_term = ddf1;

        Scalar p = k1 * (1.0 + cos_term);
        Scalar df = k1*df1;

        // cos(2*phi) term
        ddf1 = cos_term*c - df1*s;
        df1 = cos_term*s + df1*c;
        cos_term = ddf1;

        p += k2 * (1.0 - cos_term);
        df += -2.0*k2*df1;

        // cos(3*phi) term
        ddf1 = cos_term*c - df1*s;
        df1 = cos_term*s + df1*c;
        cos_term = ddf1;

        p += k3 * (1.0 + cos_term);
        df += 3.0*k3*df1;

        // cos(4*phi) term
        ddf1 = cos_term*c - df1*s;
        df1 = cos_term*s + df1*c;
        cos_term = ddf1;

        p += k4 * (1.0 - cos_term);
        df += -4.0*k4*df1;

        // Compute 1/4 of energy to assign to each of 4 atoms in the dihedral
        Scalar e_dihedral = 0.25*p;

        Scalar fg = vb1.x*vb2m.x + vb1.y*vb2m.y + vb1.z*vb2m.z;
        Scalar hg = vb3.x*vb2m.x + vb3.y*vb2m.y + vb3.z*vb2m.z;
        Scalar fga = fg*ra2inv*rginv;
        Scalar hgb = hg*rb2inv*rginv;
        Scalar gaa = -ra2inv*rg;
        Scalar gbb = rb2inv*rg;

        Scalar dtfx = gaa*ax;
        Scalar dtfy = gaa*ay;
        Scalar dtfz = gaa*az;
        Scalar dtgx = fga*ax - hgb*bx;
        Scalar dtgy = fga*ay - hgb*by;
        Scalar dtgz = fga*az - hgb*bz;
        Scalar dthx = gbb*bx;
        Scalar dthy = gbb*by;
        Scalar dthz = gbb*bz;

        Scalar sx2 = df*dtgx;
        Scalar sy2 = df*dtgy;
        Scalar sz2 = df*dtgz;

        Scalar3 f1, f2, f3, f4;
        f1.x = df*dtfx;
        f1.y = df*dtfy;
        f1.z = df*dtfz;

        f2.x = sx2 - f1.x;
        f2.y = sy2 - f1.y;
        f2.z = sz2 - f1.z;

        f4.x = df*dthx;
        f4.y = df*dthy;
        f4.z = df*dthz;

        f3.x = -sx2 - f4.x;
        f3.y = -sy2 - f4.y;
        f3.z = -sz2 - f4.z;

        // Compute 1/4 of the virial, 1/4 for each atom in the dihedral
        // upper triangular version of virial tensor

        Scalar dihedral_virial[6];
        dihedral_virial[0] = 0.25*(vb1.x*f1.x + vb2.x*f3.x + (vb3.x+vb2.x)*f4.x);
        dihedral_virial[1] = 0.25*(vb1.y*f1.x + vb2.y*f3.x + (vb3.y+vb2.y)*f4.x);
        dihedral_virial[2] = 0.25*(vb1.z*f1.x + vb2.z*f3.x + (vb3.z+vb2.z)*f4.x);
        dihedral_virial[3] = 0.25*(vb1.y*f1.y + vb2.y*f3.y + (vb3.y+vb2.y)*f4.y);
        dihedral_virial[4] = 0.25*(vb1.z*f1.y + vb2.z*f3.y + (vb3.z+vb2.z)*f4.y);
        dihedral_virial[5] = 0.25*(vb1.z*f1.z + vb2.z*f3.z + (vb3.z+vb2.z)*f4.z);

        if (cur_dihedral_abcd == 0)
            {
            force_idx.x += f1.x;
            force_idx.y += f1.y;
            force_idx.z += f1.z;
            }
        if (cur_dihedral_abcd == 1)
            {
            force_idx.x += f2.x;
            force_idx.y += f2.y;
            force_idx.z += f2.z;
            }
        if (cur_dihedral_abcd == 2)
            {
            force_idx.x += f3.x;
            force_idx.y += f3.y;
            force_idx.z += f3.z;
            }
        if (cur_dihedral_abcd == 3)
            {
            force_idx.x += f4.x;
            force_idx.y += f4.y;
            force_idx.z += f4.z;
            }
        force_idx.w += e_dihedral;

        for (int k = 0; k < 6; k++)
            virial_idx[k] += dihedral_virial[k];
        }

    // now that the force calculation is complete, write out the result (MEM TRANSFER: 20 bytes)
    d_force[idx] = force_idx;
    for (int k = 0; k < 6; k++)
       d_virial[k*virial_pitch+idx] = virial_idx[k];
    }

int main(int argc, char **argv)
    {
    // dummy arguments
    Scalar4 *d_force = 0;
    Scalar *d_virial = 0;
    unsigned int virial_pitch = 0;
   unsigned int N = 1234;
    const Scalar4 *d_pos = 0;
    const Scalar4 *d_params = 0;
    const BoxDim box(1.0,1.0,1.0);
    const group_storage<4> *tlist = 0;
    const unsigned int *dihedral_ABCD = 0;
    unsigned int pitch =0 ;
    const unsigned int *n_dihedrals_list = 0;

    static unsigned int max_block_size = UINT_MAX;
    if (max_block_size == UINT_MAX)
        {
        hipFuncAttributes attr;
        hipFuncGetAttributes(&attr, (const void *)gpu_compute_opls_dihedral_forces_kernel);
        max_block_size = attr.maxThreadsPerBlock;
        }

    unsigned int run_block_size = max_block_size;

    // setup the grid to run the kernel
    dim3 grid( N / run_block_size + 1, 1, 1);
    dim3 threads(run_block_size, 1, 1);

    // run the kernel

   hipLaunchKernelGGL((gpu_compute_opls_dihedral_forces_kernel), dim3(grid), dim3(threads), 0, 0, d_force, d_virial, virial_pitch, N, d_pos, d_params,
                                                                box, tlist, dihedral_ABCD, pitch, n_dihedrals_list);

   return 0;
   }

returns

### HCC STATUS_CHECK Error: HSA_STATUS_ERROR_INVALID_ISA (0x100f) at file:mcwamp_hsa.cpp line:1194
Aborted

on gfx-900 with hcc (HIP on ubuntu 16.04, hip_base version 2.8.19361.4078-rocm-rel-2.9-6-cbe6b65)

mangupta pushed a commit that referenced this issue Mar 18, 2020
This PR takes ensures that the maxThreadsPerBlock returned by hipFuncGetAttributes is both a multiple of the warp size and that the register usage of the maximum block does not exceed the number of available registers.

Fixes #1662
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant