From f111dc1bd32de6368a9baf4d2ae0331fba0015dd Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Oct 2022 16:48:54 +0100 Subject: [PATCH 1/7] Add stream support for copy operation --- driver/xrt/include/accl.hpp | 2 +- driver/xrt/src/accl.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/driver/xrt/include/accl.hpp b/driver/xrt/include/accl.hpp index b0d8512b..8335057c 100644 --- a/driver/xrt/include/accl.hpp +++ b/driver/xrt/include/accl.hpp @@ -303,7 +303,7 @@ class ACCL { * waitfor; nullptr if run_async is false. */ CCLO *copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, - bool from_fpga = false, bool to_fpga = false, + bool from_fpga = false, bool to_fpga = false, streamFlags stream_flags = streamFlags::NO_STREAM, bool run_async = false, std::vector waitfor = {}); /** diff --git a/driver/xrt/src/accl.cpp b/driver/xrt/src/accl.cpp index 9ca5fefa..16ae1cfa 100644 --- a/driver/xrt/src/accl.cpp +++ b/driver/xrt/src/accl.cpp @@ -297,8 +297,8 @@ CCLO *ACCL::recv(dataType dst_data_type, unsigned int count, } CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, - bool from_fpga, bool to_fpga, bool run_async, - std::vector waitfor) { + bool from_fpga, bool to_fpga, streamFlags stream_flags, + bool run_async, std::vector waitfor) { CCLO::Options options{}; if (to_fpga == false && run_async == true) { @@ -315,6 +315,7 @@ CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, options.addr_0 = &srcbuf; options.addr_2 = &dstbuf; options.count = count; + options.stream_flags = stream_flags; options.waitfor = waitfor; CCLO *handle = call_async(options); From cc4c230703ef2b59fbab61172f8fdfbf19e80d14 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Oct 2022 16:49:13 +0100 Subject: [PATCH 2/7] Add test for copy from/to stream --- test/host/hls/test.cpp | 46 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git 
a/test/host/hls/test.cpp b/test/host/hls/test.cpp index f2e5e9cf..21f21737 100644 --- a/test/host/hls/test.cpp +++ b/test/host/hls/test.cpp @@ -28,6 +28,7 @@ #include "cclo_bfm.h" #include #include +#include "dummybuffer.hpp" using namespace ACCL; @@ -143,6 +144,47 @@ std::unique_ptr test_vadd_put(options_t options) { return accl; } +void test_copy(ACCL::ACCL& accl, options_t options) { + //run test here: + //initialize a CCLO BFM and streams as needed + hlslib::Stream callreq, callack; + hlslib::Stream data_cclo2krnl("cclo2krnl"), data_krnl2cclo("krnl2cclo"); + + std::vector dest = {0}; + + CCLO_BFM cclo(options.start_port, rank, size, dest, callreq, callack, data_cclo2krnl, data_krnl2cclo); + cclo.run(); + std::cout << "CCLO BFM started" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + + //allocate float arrays for the HLS function to use + auto src_buffer = accl.create_buffer(options.count, ACCL::dataType::int32, 0); + auto dst_buffer = accl.create_buffer(options.count, ACCL::dataType::int32, 0); + for(int i=0; ibuffer()[i] = rank; + dst_buffer->buffer()[i] = 0; + } + + accl.copy(*src_buffer, dummy_buffer, options.count, false, false, ACCL::streamFlags::RES_STREAM); + + //loop back data (divide count by 16 and round up to get number of stream words) + for (int i=0; i < (options.count+15)/16; i++) { + data_krnl2cclo.write(data_cclo2krnl.read()); + } + + accl.copy(dummy_buffer, *dst_buffer, options.count, false, false, ACCL::streamFlags::OP0_STREAM); + + //check HLS function outputs + unsigned int err_count = 0; + for(int i=0; ibuffer()[i] != rank); + } + + std::cout << "Test finished with " << err_count << " errors" << std::endl; + //clean up + cclo.stop(); +} + void test_loopback_local_res(ACCL::ACCL& accl, options_t options) { //run test here: @@ -183,7 +225,6 @@ void test_loopback_local_res(ACCL::ACCL& accl, options_t options) { cclo.stop(); } - void test_loopback(ACCL::ACCL& accl, options_t options, unsigned char stream_id) { //run test here: @@ -299,7 +340,8 
@@ int main(int argc, char *argv[]) { auto accl = test_vadd_put(options); MPI_Barrier(MPI_COMM_WORLD); - + test_copy(*accl, options); + MPI_Barrier(MPI_COMM_WORLD); if(!options.hardware){ std::srand(42); for(int i=0; i Date: Tue, 18 Oct 2022 17:49:24 +0100 Subject: [PATCH 3/7] Update signatures for copy with streams --- driver/xrt/include/accl.hpp | 50 +++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/driver/xrt/include/accl.hpp b/driver/xrt/include/accl.hpp index 8335057c..7ee6be0a 100644 --- a/driver/xrt/include/accl.hpp +++ b/driver/xrt/include/accl.hpp @@ -306,6 +306,56 @@ class ACCL { bool from_fpga = false, bool to_fpga = false, streamFlags stream_flags = streamFlags::NO_STREAM, bool run_async = false, std::vector waitfor = {}); + /** + * Copy data from a stream to a buffer on the FPGA. + * + * @param dstbuf Buffer where the data should be stored to. Create a + * buffer using ACCL::create_buffer. + * @param count Amount of elements in buffer to copy. + * @param to_fpga Set to true if the data should stay on the FPGA. + * @param run_async Run the ACCL call asynchronously. + * @param waitfor ACCL call will wait for these operations before it + * will start. Currently not implemented. + * @return CCLO* CCLO object that can be waited on and passed to + * waitfor; nullptr if run_async is false. + */ + CCLO *copy_from_stream(BaseBuffer &dstbuf, unsigned int count, + bool to_fpga = false, + bool run_async = false, std::vector waitfor = {}); + + /** + * Copy data from a buffer on the FPGA to a stream. + * + * @param srcbuf Buffer that contains the data to be copied. Create a + * buffer using ACCL::create_buffer. + * @param count Amount of elements in buffer to copy. + * @param from_fpga Set to true if the data is already on the FPGA. + * @param to_fpga Set to true if the copied data will be used on the + * FPGA only. + * @param run_async Run the ACCL call asynchronously. + * @param waitfor ACCL call will wait for these operations before it + * will start. 
Currently not implemented. + * @return CCLO* CCLO object that can be waited on and passed to + * waitfor; nullptr if run_async is false. + */ + CCLO *copy_to_stream(BaseBuffer &srcbuf, unsigned int count, + bool from_fpga = false, + bool run_async = false, std::vector waitfor = {}); + + /** + * Copy data from a stream to a stream on the FPGA. + * + * @param dst_data_type Data type of input and output to stream. + * @param count Amount of elements to copy. + * @param run_async Run the ACCL call asynchronously. + * @param waitfor ACCL call will wait for these operations before it + * will start. Currently not implemented. + * @return CCLO* CCLO object that can be waited on and passed to + * waitfor; nullptr if run_async is false. + */ + CCLO *copy_from_to_stream(dataType dst_data_type, unsigned int count, + bool run_async = false, std::vector waitfor = {}); + /** * Perform reduce operation on two buffers on the FPGA. * From cd9daf4e5fc157e0b9d8dffff21c279d49cf173a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 19 Oct 2022 12:08:52 +0100 Subject: [PATCH 4/7] Implement copy to and from stream operations --- driver/xrt/include/accl.hpp | 9 +++++--- driver/xrt/src/accl.cpp | 46 ++++++++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/driver/xrt/include/accl.hpp b/driver/xrt/include/accl.hpp index 7ee6be0a..286d8b41 100644 --- a/driver/xrt/include/accl.hpp +++ b/driver/xrt/include/accl.hpp @@ -303,7 +303,7 @@ class ACCL { * waitfor; nullptr if run_async is false. */ CCLO *copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, - bool from_fpga = false, bool to_fpga = false, streamFlags stream_flags = streamFlags::NO_STREAM, + bool from_fpga = false, bool to_fpga = false, bool run_async = false, std::vector waitfor = {}); /** @@ -330,8 +330,6 @@ class ACCL { * buffer using ACCL::create_buffer. * @param count Amount of elements in buffer to copy. * @param from_fpga Set to true if the data is already on the FPGA. 
- * @param to_fpga Set to true if the copied data will be used on the - * FPGA only. * @param run_async Run the ACCL call asynchronously. * @param waitfor ACCL call will wait for these operations before it * will start. Currently not implemented. @@ -917,6 +915,11 @@ class ACCL { const int networkmem; xrt::device device; + CCLO *copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int count, + bool from_fpga, bool to_fpga, streamFlags stream_flags, + dataType data_type, bool run_async, + std::vector waitfor); + void initialize_accl(const std::vector &ranks, int local_rank, int nbufs, addr_t bufsize); diff --git a/driver/xrt/src/accl.cpp b/driver/xrt/src/accl.cpp index 16ae1cfa..aa77c590 100644 --- a/driver/xrt/src/accl.cpp +++ b/driver/xrt/src/accl.cpp @@ -296,9 +296,10 @@ CCLO *ACCL::recv(dataType dst_data_type, unsigned int count, return nullptr; } -CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, +CCLO *ACCL::copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int count, bool from_fpga, bool to_fpga, streamFlags stream_flags, - bool run_async, std::vector waitfor) { + dataType data_type, bool run_async, + std::vector waitfor) { CCLO::Options options{}; if (to_fpga == false && run_async == true) { @@ -308,12 +309,14 @@ CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, } if (from_fpga == false) { - srcbuf.sync_to_device(); + srcbuf->sync_to_device(); } options.scenario = operation::copy; - options.addr_0 = &srcbuf; - options.addr_2 = &dstbuf; + options.addr_0 = srcbuf; + options.addr_2 = dstbuf; + options.data_type_io_0 = data_type; + options.data_type_io_2 = data_type; options.count = count; options.stream_flags = stream_flags; options.waitfor = waitfor; @@ -324,7 +327,7 @@ CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, } else { handle->wait(); if (to_fpga == false) { - dstbuf.sync_from_device(); + dstbuf->sync_from_device(); } check_return_value("copy"); } @@ -332,6 +335,37 
@@ CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, return nullptr; } +CCLO *ACCL::copy(BaseBuffer &srcbuf, BaseBuffer &dstbuf, unsigned int count, + bool from_fpga, bool to_fpga, bool run_async, + std::vector waitfor) { + return copy(&srcbuf, &dstbuf, count, + from_fpga, to_fpga, streamFlags::NO_STREAM, + dataType::none, run_async, waitfor); +} + +CCLO *ACCL::copy_from_stream(BaseBuffer &dstbuf, unsigned int count, + bool to_fpga, bool run_async, + std::vector waitfor) { + return copy(nullptr, &dstbuf, count, + true, to_fpga, streamFlags::OP0_STREAM, + dstbuf.type(), run_async, waitfor); +} + +CCLO *ACCL::copy_to_stream(BaseBuffer &srcbuf, unsigned int count, + bool from_fpga, bool run_async, + std::vector waitfor) { + return copy(&srcbuf, nullptr, count, + from_fpga, true, streamFlags::RES_STREAM, + srcbuf.type(), run_async, waitfor); +} + +CCLO *ACCL::copy_from_to_stream(dataType data_type, unsigned int count, + bool run_async, std::vector waitfor) { + return copy(nullptr, nullptr, count, + true, true, streamFlags::OP0_STREAM | streamFlags::RES_STREAM, + data_type, run_async, waitfor); +} + CCLO *ACCL::combine(unsigned int count, reduceFunction function, BaseBuffer &val1, BaseBuffer &val2, BaseBuffer &result, bool val1_from_fpga, bool val2_from_fpga, bool to_fpga, From 705eede4daa44a33889dd8e178852306a8d2533e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 19 Oct 2022 14:36:54 +0100 Subject: [PATCH 5/7] Update tests with new copy signature --- test/host/hls/test.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/host/hls/test.cpp b/test/host/hls/test.cpp index 21f21737..f40acb67 100644 --- a/test/host/hls/test.cpp +++ b/test/host/hls/test.cpp @@ -28,7 +28,6 @@ #include "cclo_bfm.h" #include #include -#include "dummybuffer.hpp" using namespace ACCL; @@ -165,14 +164,14 @@ void test_copy(ACCL::ACCL& accl, options_t options) { dst_buffer->buffer()[i] = 0; } - accl.copy(*src_buffer, dummy_buffer, 
options.count, false, false, ACCL::streamFlags::RES_STREAM); + accl.copy_to_stream(*src_buffer, options.count, false); //loop back data (divide count by 16 and round up to get number of stream words) for (int i=0; i < (options.count+15)/16; i++) { data_krnl2cclo.write(data_cclo2krnl.read()); } - accl.copy(dummy_buffer, *dst_buffer, options.count, false, false, ACCL::streamFlags::OP0_STREAM); + accl.copy_from_stream(*dst_buffer, options.count, false); //check HLS function outputs unsigned int err_count = 0; From 5ec61cbbeddf8e940eb268a3afe875f2fc4aab1c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 15:12:23 +0100 Subject: [PATCH 6/7] Make copy test work with hardware --- test/host/hls/test.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test/host/hls/test.cpp b/test/host/hls/test.cpp index f40acb67..12b2e35a 100644 --- a/test/host/hls/test.cpp +++ b/test/host/hls/test.cpp @@ -150,10 +150,13 @@ void test_copy(ACCL::ACCL& accl, options_t options) { hlslib::Stream data_cclo2krnl("cclo2krnl"), data_krnl2cclo("krnl2cclo"); std::vector dest = {0}; + std::unique_ptr cclo; - CCLO_BFM cclo(options.start_port, rank, size, dest, callreq, callack, data_cclo2krnl, data_krnl2cclo); - cclo.run(); - std::cout << "CCLO BFM started" << std::endl; + if (!options.hardware) { + cclo = std::make_unique(options.start_port, rank, size, dest, callreq, callack, data_cclo2krnl, data_krnl2cclo); + cclo->run(); + std::cout << "CCLO BFM started" << std::endl; + } MPI_Barrier(MPI_COMM_WORLD); //allocate float arrays for the HLS function to use @@ -180,8 +183,10 @@ void test_copy(ACCL::ACCL& accl, options_t options) { } std::cout << "Test finished with " << err_count << " errors" << std::endl; - //clean up - cclo.stop(); + if (!options.hardware) { + //clean up + cclo->stop(); + } } void test_loopback_local_res(ACCL::ACCL& accl, options_t options) { From 557fa5fd8a5aacfaa009f09f23ac7c05a181e352 Mon Sep 17 00:00:00 2001 From: Marius Meyer 
Date: Wed, 26 Oct 2022 11:31:25 +0100 Subject: [PATCH 7/7] Move copy stream test to XRT test suite --- test/host/hls/test.cpp | 48 ------------------------------------------ test/host/xrt/test.cpp | 33 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/test/host/hls/test.cpp b/test/host/hls/test.cpp index 12b2e35a..1da8bb48 100644 --- a/test/host/hls/test.cpp +++ b/test/host/hls/test.cpp @@ -143,52 +143,6 @@ std::unique_ptr test_vadd_put(options_t options) { return accl; } -void test_copy(ACCL::ACCL& accl, options_t options) { - //run test here: - //initialize a CCLO BFM and streams as needed - hlslib::Stream callreq, callack; - hlslib::Stream data_cclo2krnl("cclo2krnl"), data_krnl2cclo("krnl2cclo"); - - std::vector dest = {0}; - std::unique_ptr cclo; - - if (!options.hardware) { - cclo = std::make_unique(options.start_port, rank, size, dest, callreq, callack, data_cclo2krnl, data_krnl2cclo); - cclo->run(); - std::cout << "CCLO BFM started" << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - - //allocate float arrays for the HLS function to use - auto src_buffer = accl.create_buffer(options.count, ACCL::dataType::int32, 0); - auto dst_buffer = accl.create_buffer(options.count, ACCL::dataType::int32, 0); - for(int i=0; ibuffer()[i] = rank; - dst_buffer->buffer()[i] = 0; - } - - accl.copy_to_stream(*src_buffer, options.count, false); - - //loop back data (divide count by 16 and round up to get number of stream words) - for (int i=0; i < (options.count+15)/16; i++) { - data_krnl2cclo.write(data_cclo2krnl.read()); - } - - accl.copy_from_stream(*dst_buffer, options.count, false); - - //check HLS function outputs - unsigned int err_count = 0; - for(int i=0; ibuffer()[i] != rank); - } - - std::cout << "Test finished with " << err_count << " errors" << std::endl; - if (!options.hardware) { - //clean up - cclo->stop(); - } -} - void test_loopback_local_res(ACCL::ACCL& accl, options_t options) { //run test here: @@ -344,8 +298,6 @@ 
int main(int argc, char *argv[]) { auto accl = test_vadd_put(options); MPI_Barrier(MPI_COMM_WORLD); - test_copy(*accl, options); - MPI_Barrier(MPI_COMM_WORLD); if(!options.hardware){ std::srand(42); for(int i=0; i(count, dataType::float32); + auto res_buf = accl.create_buffer(count, dataType::float32); + random_array(op_buf->buffer(), count); + + test_debug("Copy data from buffer to stream", options); + accl.copy_to_stream(*op_buf, count, false); + test_debug("Copy data from stream to buffer", options); + accl.copy_from_stream(*res_buf, count, false); + int errors = 0; + for (unsigned int i = 0; i < count; ++i) { + float ref = (*op_buf)[i]; + float res = (*res_buf)[i]; + if (res != ref) { + std::cout << i + 1 + << "th item is incorrect! (" + std::to_string(res) + + " != " + std::to_string(ref) + ")" + << std::endl; + errors += 1; + } + } + + if (errors > 0) { + std::cout << errors << " errors!" << std::endl; + } else { + std::cout << "Test succesfull!" << std::endl; + } +} + void test_copy_p2p(ACCL::ACCL &accl, options_t &options) { std::cout << "Start copy p2p test..." << std::endl; unsigned int count = options.count; @@ -1350,6 +1381,8 @@ int start_test(options_t options) { MPI_Barrier(MPI_COMM_WORLD); test_copy(*accl, options); MPI_Barrier(MPI_COMM_WORLD); + test_copy_stream(*accl, options); + MPI_Barrier(MPI_COMM_WORLD); test_copy_p2p(*accl, options); MPI_Barrier(MPI_COMM_WORLD); test_combine_sum(*accl, options);