
Commit 1c4614a

Merge pull request #34 from ROCmSoftwarePlatform/develop
Memory footprint reduction
mkarunan committed May 26, 2022
2 parents e195d34 + c7b72ea commit 1c4614a
Showing 19 changed files with 670 additions and 77 deletions.
test/common.hpp: 223 additions & 0 deletions
@@ -35,6 +35,8 @@

#include <rocwmma/internal/types.hpp>

#include "device/common.hpp"

#ifndef CHECK_HIP_ERROR
#define CHECK_HIP_ERROR(status) \
if(status != hipSuccess) \
@@ -211,6 +213,56 @@ namespace rocwmma
assert(mat.size() == n * m);
fill(mat.data(), m, n, value);
}

// fill kernel wrapper for M x N matrix
template <typename DataT>
__host__ static inline void fillLaunchKernel(DataT* d_mat, uint32_t m, uint32_t n)
{
auto blockDim = dim3(1024, 1, 1);
auto gridDim = dim3(ceilDiv(m * n, blockDim.x), 1, 1);
hipLaunchKernelGGL((fillKernel<DataT, Layout>),
gridDim,
blockDim,
0,
0,
d_mat,
m,
n);
}

// fill kernel wrapper for batched M x K matrices
template <typename DataT>
__host__ static inline void fillLaunchKernel(DataT* d_mat, uint32_t m, uint32_t k, uint32_t b)
{
auto blockDim = dim3(1024, 1, 1);
auto gridDim = dim3(ceilDiv(m * k, blockDim.x), 1, b);
hipLaunchKernelGGL((fillKernel<DataT, Layout>),
gridDim,
blockDim,
0,
0,
d_mat,
m,
k,
b);
}

// fill kernel wrapper for an M x N matrix filled with a specific value
template <typename DataT>
__host__ static inline void fillLaunchKernel(DataT* d_mat, uint32_t m, uint32_t n, DataT value)
{
auto blockDim = dim3(1024, 1, 1);
auto gridDim = dim3(ceilDiv(m * n, blockDim.x), 1, 1);
hipLaunchKernelGGL((fillKernel<DataT, Layout>),
gridDim,
blockDim,
0,
0,
d_mat,
m,
n,
value);
}
};
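
As a usage sketch for reviewers: these wrappers let tests fill matrices directly on the device instead of building a host-side copy first, which is where the memory-footprint saving comes from. A minimal, hedged example follows, assuming the enclosing struct is the layout-templated matrix utility used by the tests (the MatrixUtil<row_major> name below is illustrative):

// Hedged usage sketch (illustrative names): fill an M x N device matrix on
// the GPU without allocating a host-side staging buffer.
template <typename DataT>
DataT* allocAndFillDevice(uint32_t m, uint32_t n)
{
    DataT* d_mat = nullptr;
    CHECK_HIP_ERROR(hipMalloc(&d_mat, m * n * sizeof(DataT)));

    // MatrixUtil<row_major> stands in for the layout-templated utility that
    // owns the fillLaunchKernel wrappers added in this file.
    MatrixUtil<row_major>::fillLaunchKernel(d_mat, m, n);

    return d_mat;
}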

// compareEqual on two different layouts: must calculate index offsets
@@ -437,6 +489,177 @@ namespace rocwmma
a.data(), b.data(), m, n, lda, ldb, tolerance);
}

// compareEqual kernel wrapper for gemm tests
template <typename TypeA, typename TypeB, typename LayoutA, typename LayoutB>
std::pair<bool, double> compareEqualLaunchKernel(TypeA* matrixA,
TypeB* matrixB,
uint32_t m,
uint32_t n,
double tolerance = 10.0)
{
uint32_t lda = std::is_same<LayoutA, row_major>::value ? n : m;
uint32_t ldb = std::is_same<LayoutB, row_major>::value ? n : m;

auto blockDim = dim3(1024, 1, 1);
auto gridDim = dim3(ceilDiv(m * n, blockDim.x), 1, 1);

double *d_relativeError;
double maxRelativeError;
CHECK_HIP_ERROR(hipMalloc(&d_relativeError, m * n * sizeof(double)));

hipEvent_t syncEvent;
CHECK_HIP_ERROR(hipEventCreate(&syncEvent));

// Calculate the relative error for each element of matrix A/B
hipLaunchKernelGGL((compareEqualKernel<TypeA, TypeB, LayoutA, LayoutB>),
gridDim,
blockDim,
0,
0,
matrixA,
matrixB,
d_relativeError,
m,
n,
lda,
ldb);
CHECK_HIP_ERROR(hipEventRecord(syncEvent));
CHECK_HIP_ERROR(hipEventSynchronize(syncEvent));

// Determine the maximum relative error
blockDim = dim3(512, 1, 1);
uint32_t maxElements = 1024;
uint32_t offset = 1;

for (uint32_t i = m * n; i > 1; i = ceilDiv(i, maxElements))
{
gridDim = dim3(ceilDiv(i, maxElements), 1, 1);
auto elements = i > maxElements ? maxElements : i;

hipLaunchKernelGGL((maxReduceKernel),
gridDim,
blockDim,
0,
0,
d_relativeError,
elements,
offset,
m * n);

CHECK_HIP_ERROR(hipEventRecord(syncEvent));
CHECK_HIP_ERROR(hipEventSynchronize(syncEvent));
offset = offset * maxElements;
}

CHECK_HIP_ERROR(hipMemcpy(&maxRelativeError, d_relativeError, sizeof(double), hipMemcpyDeviceToHost));

// Free allocated device memory
CHECK_HIP_ERROR(hipFree(d_relativeError));

bool retval = true;
bool isNaN = std::isnan(maxRelativeError);

auto toDoubleA
= [](TypeA const& val) { return static_cast<double>(static_cast<float>(val)); };

auto eps = toDoubleA(std::numeric_limits<TypeA>::epsilon());
if(isNaN)
{
retval = false;
maxRelativeError = std::numeric_limits<TypeA>::signaling_NaN();
}
else if(maxRelativeError > (eps * tolerance))
{
retval = false;
}

return std::make_pair(retval, maxRelativeError);
}
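
A corresponding sketch of how a gemm validation path might call this wrapper, keeping both the result and the reference on the device (buffer and function names below are illustrative, not part of this change):

// Hedged usage sketch (illustrative names): validate a device-resident gemm
// result against a device-resident reference without copying back to host.
bool validateGemm(float*   d_result,    // m x n, row_major
                  float*   d_reference, // m x n, col_major
                  uint32_t m,
                  uint32_t n)
{
    auto result = compareEqualLaunchKernel<float, float, row_major, col_major>(
        d_result, d_reference, m, n);

    // result.first: pass/fail; result.second: max relative error observed.
    return result.first;
}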

// compareEqual kernel wrapper for batched matrices
template <typename TypeA, typename TypeB>
std::pair<bool, double> compareEqualLaunchKernel(TypeA* matrixA,
TypeB* matrixB,
uint32_t m,
uint32_t k,
uint32_t b,
double tolerance = 10.0)
{
auto blockDim = dim3(1024, 1, 1);
auto gridDim = dim3(ceilDiv(m * k, blockDim.x), 1, b);

double *d_relativeError;
double maxRelativeError;
CHECK_HIP_ERROR(hipMalloc(&d_relativeError, m * k * b * sizeof(double)));

hipEvent_t syncEvent;
CHECK_HIP_ERROR(hipEventCreate(&syncEvent));

// Calculate the relative error for each element of matrix A/B
hipLaunchKernelGGL((compareEqualKernel<TypeA, TypeB>),
gridDim,
blockDim,
0,
0,
matrixA,
matrixB,
d_relativeError,
m,
k,
b);
CHECK_HIP_ERROR(hipEventRecord(syncEvent));
CHECK_HIP_ERROR(hipEventSynchronize(syncEvent));

// Determine the maximum relative error
blockDim = dim3(512, 1, 1);
uint32_t maxElements = 1024;
uint32_t offset = 1;

for (uint32_t i = m * k * b; i > 1; i = ceilDiv(i, maxElements))
{
gridDim = dim3(ceilDiv(i, maxElements), 1, 1);
auto elements = i > maxElements ? maxElements : i;

hipLaunchKernelGGL((maxReduceKernel),
gridDim,
blockDim,
0,
0,
d_relativeError,
elements,
offset,
m * k * b);

CHECK_HIP_ERROR(hipEventRecord(syncEvent));
CHECK_HIP_ERROR(hipEventSynchronize(syncEvent));
offset = offset * maxElements;
}

CHECK_HIP_ERROR(hipMemcpy(&maxRelativeError, d_relativeError, sizeof(double), hipMemcpyDeviceToHost));

// Free allocated device memory
CHECK_HIP_ERROR(hipFree(d_relativeError));

bool retval = true;
bool isNaN = std::isnan(maxRelativeError);

auto toDoubleA
= [](TypeA const& val) { return static_cast<double>(static_cast<float>(val)); };

auto eps = toDoubleA(std::numeric_limits<TypeA>::epsilon());
if(isNaN)
{
retval = false;
maxRelativeError = std::numeric_limits<TypeA>::signaling_NaN();
}
else if(maxRelativeError > (eps * tolerance))
{
retval = false;
}

return std::make_pair(retval, maxRelativeError);
}
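
A note on the reduction loops above: per the loop bounds, each pass reduces up to maxElements (1024) values at a time and shrinks the problem by that factor, so the loop runs roughly log base 1024 of the element count passes, and the final maximum is read back from d_relativeError[0]. Below is a standalone sketch of the host-side pass arithmetic only (plain C++, assuming ceilDiv rounds up as (a + b - 1) / b):

// Standalone sketch of the pass arithmetic driving the maxReduceKernel loops
// above. For a 1024 x 1024 comparison (2^20 elements) it prints two passes:
// 1024 blocks reducing 1024 values each, then 1 block reducing the partials.
#include <cstdint>
#include <cstdio>

int main()
{
    auto ceilDiv = [](uint32_t a, uint32_t b) { return (a + b - 1) / b; };

    uint32_t total       = 1u << 20;
    uint32_t maxElements = 1024;
    uint32_t offset      = 1;

    for(uint32_t i = total; i > 1; i = ceilDiv(i, maxElements))
    {
        std::printf("blocks = %u, elements = %u, stride = %u\n",
                    ceilDiv(i, maxElements),
                    i > maxElements ? maxElements : i,
                    offset);
        offset *= maxElements;
    }
    return 0;
}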

// Count occurrences of val in the input array
template <typename DataT>
uint64_t countVal(DataT const* a, uint64_t size, DataT const& val, double tolerance = 10.0)
