-
Notifications
You must be signed in to change notification settings - Fork 201
/
host.cpp
132 lines (120 loc) · 5.76 KB
/
host.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/**
* Copyright (C) 2020 Xilinx, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may
* not use this file except in compliance with the License. A copy of the
* License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
#include "xcl2.hpp"
#include <algorithm>
#include <vector>
// Number of int elements in each input/output vector processed by the kernel.
// A typed constant (rather than a macro) so it obeys scope and type checking;
// it is an int, exactly like the literal the macro used to expand to.
constexpr int DATA_SIZE = 4096;
/**
 * Host program for the "vadd" vector-addition kernel.
 *
 * Fills two DATA_SIZE-element input vectors with std::rand() values, computes
 * a reference sum on the CPU, programs the first Xilinx device that accepts
 * the given xclbin, runs the kernel once and compares the device output with
 * the CPU reference.
 *
 * @param argc  Must be 2 (program name plus the xclbin path).
 * @param argv  argv[1] is the path of the xclbin file to load.
 * @return EXIT_SUCCESS when every element matches; EXIT_FAILURE on bad usage,
 *         when no device could be programmed, or on a result mismatch.
 *
 * NOTE(review): OCL_CHECK comes from xcl2.hpp and is not visible here; it is
 * presumably terminating the process when the wrapped call reports an error.
 */
int main(int argc, char** argv) {
    if (argc != 2) {
        std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
        return EXIT_FAILURE;
    }

    std::string binaryFile = argv[1];
    size_t vector_size_bytes = sizeof(int) * DATA_SIZE;
    cl_int err;
    cl::Context context;
    cl::Kernel krnl_vector_add;
    cl::CommandQueue q;

    // Allocate memory in host memory.
    // When a buffer is created from a user pointer (CL_MEM_USE_HOST_PTR), the
    // runtime uses that pointer directly only if it is properly aligned; when
    // it is not aligned, the runtime has no choice but to create its own
    // host-side buffer. Using aligned_allocator aligns the user buffer to a
    // page boundary, which ensures the user buffer itself is used when a
    // Buffer/Mem object is created with CL_MEM_USE_HOST_PTR.
    std::vector<int, aligned_allocator<int> > source_in1(DATA_SIZE);
    std::vector<int, aligned_allocator<int> > source_in2(DATA_SIZE);
    std::vector<int, aligned_allocator<int> > source_hw_results(DATA_SIZE);
    std::vector<int, aligned_allocator<int> > source_sw_results(DATA_SIZE);

    // Create the test data and the CPU reference result.
    std::generate(source_in1.begin(), source_in1.end(), std::rand);
    std::generate(source_in2.begin(), source_in2.end(), std::rand);
    for (int i = 0; i < DATA_SIZE; i++) {
        // Each operand may be as large as RAND_MAX, so a plain int addition can
        // overflow, which is undefined behaviour. Add in unsigned arithmetic
        // (defined to wrap) and convert back.
        // NOTE(review): assumes the kernel's 32-bit adder wraps the same way —
        // confirm against the vadd kernel source.
        const unsigned int wrapped_sum =
            static_cast<unsigned int>(source_in1[i]) + static_cast<unsigned int>(source_in2[i]);
        source_sw_results[i] = static_cast<int>(wrapped_sum);
        source_hw_results[i] = 0;
    }

    // OPENCL HOST CODE AREA START
    // get_xil_devices() is a utility API which finds the Xilinx platforms and
    // returns the list of devices connected to a Xilinx platform.
    auto devices = xcl::get_xil_devices();
    // read_binary_file() is a utility API which loads binaryFile and returns
    // the file contents as a buffer.
    auto fileBuf = xcl::read_binary_file(binaryFile);
    cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}};

    // Try each device in turn and keep the first one the xclbin programs.
    bool valid_device = false;
    for (unsigned int i = 0; i < devices.size(); i++) {
        auto device = devices[i];
        // Create the context and command queue for the selected device.
        OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
        OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
        std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
        // Programming failure is not fatal here: fall through to the next device.
        cl::Program program(context, {device}, bins, nullptr, &err);
        if (err != CL_SUCCESS) {
            std::cout << "Failed to program device[" << i << "] with xclbin file!\n";
        } else {
            std::cout << "Device[" << i << "]: program successful!\n";
            OCL_CHECK(err, krnl_vector_add = cl::Kernel(program, "vadd", &err));
            valid_device = true;
            break; // we break because we found a valid device
        }
    }
    if (!valid_device) {
        std::cout << "Failed to program any device found, exit!\n";
        // Return instead of calling exit() so the local objects are destroyed.
        return EXIT_FAILURE;
    }

    // Allocate buffers in global memory.
    // Buffers are created with CL_MEM_USE_HOST_PTR so they are backed by the
    // aligned host vectors allocated above.
    OCL_CHECK(err, cl::Buffer buffer_in1(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_size_bytes,
                                         source_in1.data(), &err));
    OCL_CHECK(err, cl::Buffer buffer_in2(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_size_bytes,
                                         source_in2.data(), &err));
    OCL_CHECK(err, cl::Buffer buffer_output(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, vector_size_bytes,
                                            source_hw_results.data(), &err));

    // Kernel arguments: two inputs, one output, element count.
    int size = DATA_SIZE;
    OCL_CHECK(err, err = krnl_vector_add.setArg(0, buffer_in1));
    OCL_CHECK(err, err = krnl_vector_add.setArg(1, buffer_in2));
    OCL_CHECK(err, err = krnl_vector_add.setArg(2, buffer_output));
    OCL_CHECK(err, err = krnl_vector_add.setArg(3, size));

    // Copy input data to device global memory.
    OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_in1, buffer_in2}, 0 /* 0 means from host*/));

    // Launch the kernel.
    // For HLS kernels global and local size is always (1,1,1), so it is
    // recommended to always use enqueueTask() for invoking an HLS kernel.
    OCL_CHECK(err, err = q.enqueueTask(krnl_vector_add));

    // Copy the result from device global memory to host local memory.
    OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_output}, CL_MIGRATE_MEM_OBJECT_HOST));

    // Wait for all enqueued commands; check the status so a failure is
    // reported rather than silently comparing against stale host data.
    OCL_CHECK(err, err = q.finish());
    // OPENCL HOST CODE AREA END

    // Compare the device results with the CPU reference; stop at the first
    // mismatch and report it.
    bool match = true;
    for (int i = 0; i < DATA_SIZE; i++) {
        if (source_hw_results[i] != source_sw_results[i]) {
            std::cout << "Error: Result mismatch" << std::endl;
            std::cout << "i = " << i << " CPU result = " << source_sw_results[i]
                      << " Device result = " << source_hw_results[i] << std::endl;
            match = false;
            break;
        }
    }

    std::cout << "TEST " << (match ? "PASSED" : "FAILED") << std::endl;
    return (match ? EXIT_SUCCESS : EXIT_FAILURE);
}