Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copy from and to stream #100

Merged
merged 7 commits into from
Oct 26, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions driver/xrt/include/accl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,54 @@ class ACCL {
bool from_fpga = false, bool to_fpga = false,
bool run_async = false, std::vector<CCLO *> waitfor = {});

/**
* Copy data from a stream into a buffer.
*
* Forwards to the internal copy routine without a source buffer and with
* the OP0_STREAM flag set; the data type is taken from dstbuf.
*
* @param dstbuf Buffer where the data should be stored to. Create a
* buffer using ACCL::create_buffer.
* @param count Amount of elements to copy.
* @param to_fpga If false, dstbuf is synced from the device once the
* call has been waited on. Set to true to skip that sync
* and leave the data on the FPGA.
* @param run_async Run the ACCL call asynchronously.
* @param waitfor ACCL call will wait for these operations before it
* will start. Currently not implemented.
* @return CCLO* CCLO object that can be waited on and passed to
* waitfor; nullptr if run_async is false.
*/
CCLO *copy_from_stream(BaseBuffer &dstbuf, unsigned int count,
bool to_fpga = false,
bool run_async = false, std::vector<CCLO *> waitfor = {});

/**
* Copy data from a buffer into a stream.
*
* Forwards to the internal copy routine without a destination buffer and
* with the RES_STREAM flag set; the data type is taken from srcbuf.
*
* @param srcbuf Buffer that contains the data to be copied. Create a
* buffer using ACCL::create_buffer.
* @param count Amount of elements in buffer to copy.
* @param from_fpga Set to true if the data is already on the FPGA. If
* false, srcbuf is synced to the device before the call
* is issued.
* @param run_async Run the ACCL call asynchronously.
* @param waitfor ACCL call will wait for these operations before it
* will start. Currently not implemented.
* @return CCLO* CCLO object that can be waited on and passed to
* waitfor; nullptr if run_async is false.
*/
CCLO *copy_to_stream(BaseBuffer &srcbuf, unsigned int count,
bool from_fpga = false,
bool run_async = false, std::vector<CCLO *> waitfor = {});

/**
* Copy data from a stream to a stream, without any buffer involved.
*
* Forwards to the internal copy routine with neither a source nor a
* destination buffer and with both the OP0_STREAM and RES_STREAM flags set.
*
* @param dst_data_type Data type used for both the input and the output
* side of the copy.
* @param count Amount of elements to copy.
* @param run_async Run the ACCL call asynchronously.
* @param waitfor ACCL call will wait for these operations before it
* will start. Currently not implemented.
* @return CCLO* CCLO object that can be waited on and passed to
* waitfor; nullptr if run_async is false.
*/
CCLO *copy_from_to_stream(dataType dst_data_type, unsigned int count,
bool run_async = false, std::vector<CCLO *> waitfor = {});

/**
* Perform reduce operation on two buffers on the FPGA.
*
Expand Down Expand Up @@ -867,6 +915,11 @@ class ACCL {
const int networkmem;
xrt::device device;

/**
* Shared implementation behind copy, copy_from_stream, copy_to_stream and
* copy_from_to_stream.
*
* @param srcbuf Source buffer. The stream-sourced variants pass
* nullptr together with from_fpga = true.
* @param dstbuf Destination buffer. The stream-targeted variants pass
* nullptr together with to_fpga = true.
* @param count Amount of elements to copy.
* @param from_fpga If false, srcbuf is synced to the device before the
* call is issued.
* @param to_fpga If false, dstbuf is synced from the device after the
* call has been waited on.
* @param stream_flags Stream flags forwarded to the call options.
* @param data_type Data type set for both operand 0 and the result.
* @param run_async Run the ACCL call asynchronously.
* @param waitfor Operations forwarded to the call options as waitfor.
* @return CCLO* nullptr after a call that was waited on; presumably
* the CCLO handle when run_async is true (confirm
* against the implementation).
*/
CCLO *copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int count,
bool from_fpga, bool to_fpga, streamFlags stream_flags,
dataType data_type, bool run_async,
std::vector<CCLO *> waitfor);

void initialize_accl(const std::vector<rank_t> &ranks, int local_rank,
int nbufs, addr_t bufsize);

Expand Down
47 changes: 41 additions & 6 deletions driver/xrt/src/accl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,9 @@ CCLO *ACCL::recv(dataType dst_data_type, unsigned int count,
return nullptr;
}

CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count,
bool from_fpga, bool to_fpga, bool run_async,
CCLO *ACCL::copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int count,
bool from_fpga, bool to_fpga, streamFlags stream_flags,
dataType data_type, bool run_async,
std::vector<CCLO *> waitfor) {
CCLO::Options options{};

Expand All @@ -308,13 +309,16 @@ CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count,
}

if (from_fpga == false) {
srcbuf.sync_to_device();
srcbuf->sync_to_device();
}

options.scenario = operation::copy;
options.addr_0 = &srcbuf;
options.addr_2 = &dstbuf;
options.addr_0 = srcbuf;
options.addr_2 = dstbuf;
options.data_type_io_0 = data_type;
options.data_type_io_2 = data_type;
options.count = count;
options.stream_flags = stream_flags;
options.waitfor = waitfor;
CCLO *handle = call_async(options);

Expand All @@ -323,14 +327,45 @@ CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count,
} else {
handle->wait();
if (to_fpga == false) {
dstbuf.sync_from_device();
dstbuf->sync_from_device();
}
check_return_value("copy");
}

return nullptr;
}

// Buffer-to-buffer copy: hands both buffers to the pointer-based copy()
// with streaming disabled (NO_STREAM) and dataType::none as the data type.
CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count,
                 bool from_fpga, bool to_fpga, bool run_async,
                 std::vector<CCLO *> waitfor) {
  BaseBuffer *const source = &srcbuf;
  BaseBuffer *const destination = &dstbuf;

  return copy(source, destination, count, from_fpga, to_fpga,
              streamFlags::NO_STREAM, dataType::none, run_async, waitfor);
}

// Stream-to-buffer copy. There is no source buffer, so from_fpga is forced
// to true, which keeps copy() from syncing a (null) source to the device.
CCLO *ACCL::copy_from_stream(BaseBuffer &dstbuf, unsigned int count,
                             bool to_fpga, bool run_async,
                             std::vector<CCLO *> waitfor) {
  BaseBuffer *const no_source = nullptr;
  const bool source_on_fpga = true;

  return copy(no_source, &dstbuf, count, source_on_fpga, to_fpga,
              streamFlags::OP0_STREAM, dstbuf.type(), run_async, waitfor);
}

// Buffer-to-stream copy. There is no destination buffer, so to_fpga is
// forced to true, which keeps copy() from syncing a (null) destination back
// from the device.
CCLO *ACCL::copy_to_stream(BaseBuffer &srcbuf, unsigned int count,
                           bool from_fpga, bool run_async,
                           std::vector<CCLO *> waitfor) {
  BaseBuffer *const no_destination = nullptr;
  const bool result_stays_on_fpga = true;

  return copy(&srcbuf, no_destination, count, from_fpga, result_stays_on_fpga,
              streamFlags::RES_STREAM, srcbuf.type(), run_async, waitfor);
}

// Stream-to-stream copy. Neither side has a buffer, so both sync flags are
// forced to true and both stream flags are set.
CCLO *ACCL::copy_from_to_stream(dataType data_type, unsigned int count,
                                bool run_async, std::vector<CCLO *> waitfor) {
  const auto both_streams = streamFlags::OP0_STREAM | streamFlags::RES_STREAM;
  BaseBuffer *const no_buffer = nullptr;

  return copy(no_buffer, no_buffer, count, true, true, both_streams,
              data_type, run_async, waitfor);
}

CCLO *ACCL::combine(unsigned int count, reduceFunction function,
BaseBuffer &val1, BaseBuffer &val2, BaseBuffer &result,
bool val1_from_fpga, bool val2_from_fpga, bool to_fpga,
Expand Down
45 changes: 43 additions & 2 deletions test/host/hls/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,47 @@ std::unique_ptr<ACCL::ACCL> test_vadd_put(options_t options) {
return accl;
}

void test_copy(ACCL::ACCL& accl, options_t options) {
//run test here:
//initialize a CCLO BFM and streams as needed
hlslib::Stream<command_word> callreq, callack;
hlslib::Stream<stream_word, 512> data_cclo2krnl("cclo2krnl"), data_krnl2cclo("krnl2cclo");

std::vector<unsigned int> dest = {0};

CCLO_BFM cclo(options.start_port, rank, size, dest, callreq, callack, data_cclo2krnl, data_krnl2cclo);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not applicable on hardware. When I run it with the axis3x design for U55c it tries to connect to a non-existing ZMQ server and the test crashes with CCLO @0x0: during copy the following error(s) occured: DMA DECODE ERROR, DMA NOT OKAY ERROR (00000000000000000000010100).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I create a BFM, which is not used in hardware. So I have to change this. But the error sounds like something else is... not okay. Let me check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fixed the BFM creation for hardware builds. But I get the same error during hardware execution. Maybe we need a recent build to test this. It works in the simulator though...

cclo.run();
std::cout << "CCLO BFM started" << std::endl;
MPI_Barrier(MPI_COMM_WORLD);

//allocate int32 source and destination buffers for the copy test
auto src_buffer = accl.create_buffer<int>(options.count, ACCL::dataType::int32, 0);
auto dst_buffer = accl.create_buffer<int>(options.count, ACCL::dataType::int32, 0);
for(int i=0; i<options.count; i++){
src_buffer->buffer()[i] = rank;
dst_buffer->buffer()[i] = 0;
}

accl.copy_to_stream(*src_buffer, options.count, false);

//loop back data (divide count by 16 and round up to get number of stream words)
for (int i=0; i < (options.count+15)/16; i++) {
data_krnl2cclo.write(data_cclo2krnl.read());
}

accl.copy_from_stream(*dst_buffer, options.count, false);

//check that the destination buffer received the looped-back data
unsigned int err_count = 0;
for(int i=0; i<options.count; i++){
err_count += (dst_buffer->buffer()[i] != rank);
}

std::cout << "Test finished with " << err_count << " errors" << std::endl;
//clean up
cclo.stop();
}

void test_loopback_local_res(ACCL::ACCL& accl, options_t options) {

//run test here:
Expand Down Expand Up @@ -183,7 +224,6 @@ void test_loopback_local_res(ACCL::ACCL& accl, options_t options) {
cclo.stop();
}


void test_loopback(ACCL::ACCL& accl, options_t options, unsigned char stream_id) {

//run test here:
Expand Down Expand Up @@ -299,7 +339,8 @@ int main(int argc, char *argv[]) {

auto accl = test_vadd_put(options);
MPI_Barrier(MPI_COMM_WORLD);

test_copy(*accl, options);
MPI_Barrier(MPI_COMM_WORLD);
if(!options.hardware){
std::srand(42);
for(int i=0; i<options.nruns; i++){
Expand Down