From d295f2299101705f7c470c813b80542296087328 Mon Sep 17 00:00:00 2001 From: Nuno Miguel Nobre Date: Thu, 11 Aug 2022 03:40:09 +0100 Subject: [PATCH] [SYCL] Remove amrex::oneapi and update deprecated device descriptors (#2910) * Remove amrex::oneapi in favour of standard features * Change deprecated device descriptors --- Src/Base/AMReX_GpuDevice.cpp | 4 ++-- Src/Base/AMReX_GpuQualifiers.H | 4 ---- Src/Base/AMReX_GpuReduce.H | 8 ++++---- Src/Base/AMReX_Scan.H | 22 +++++++++++----------- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index c0e9b3e6785..e0ab64b76e3 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -463,8 +463,8 @@ Device::initialize_gpu () device_prop.warpSize = warp_size; auto sgss = d.get_info(); device_prop.maxMemAllocSize = d.get_info(); - device_prop.managedMemory = d.get_info(); - device_prop.concurrentManagedAccess = d.get_info(); + device_prop.managedMemory = d.has(sycl::aspect::usm_host_allocations); + device_prop.concurrentManagedAccess = d.has(sycl::aspect::usm_shared_allocations); device_prop.maxParameterSize = d.get_info(); { amrex::Print() << "Device Properties:\n" diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index ce07a3e52c2..b5d5ea58fbd 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -41,10 +41,6 @@ # include -namespace amrex { - namespace oneapi = sycl::ext::oneapi; -} - # define AMREX_REQUIRE_SUBGROUP_SIZE(x) \ _Pragma("clang diagnostic push") \ _Pragma("clang diagnostic ignored \"-Wattributes\"") \ diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H index 3907ca385f6..7b9b0e42355 100644 --- a/Src/Base/AMReX_GpuReduce.H +++ b/Src/Base/AMReX_GpuReduce.H @@ -55,10 +55,10 @@ template struct warpReduce { AMREX_GPU_DEVICE AMREX_FORCE_INLINE - T operator() (T x, amrex::oneapi::sub_group const& sg) const noexcept + T operator() (T x, sycl::sub_group const& sg) const noexcept { for (int offset = warpSize/2; offset > 0; offset /= 2) { - T y = sg.shuffle_down(x, offset); + T y = sycl::shift_group_left(sg, x, offset); x = F()(x,y); } return x; @@ -71,7 +71,7 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h) { T* shared = (T*)h.local; int tid = h.item->get_local_id(0); - amrex::oneapi::sub_group const& sg = h.item->get_sub_group(); + sycl::sub_group const& sg = h.item->get_sub_group(); int lane = sg.get_local_id()[0]; int wid = sg.get_group_id()[0]; int numwarps = sg.get_group_range()[0]; @@ -94,7 +94,7 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, Gpu::Handler const& handler) { - amrex::oneapi::sub_group const& sg = handler.item->get_sub_group(); + sycl::sub_group const& sg = handler.item->get_sub_group(); int wid = sg.get_group_id()[0]; if ((wid+1)*warpSize <= handler.numActiveThreads) { x = warp_reduce(x, sg); // full warp diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H index 96aefb870b6..3dc5cb98f9a 100644 --- a/Src/Base/AMReX_Scan.H +++ b/Src/Base/AMReX_Scan.H @@ -197,7 +197,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) amrex::launch(nblocks, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - amrex::oneapi::sub_group const& sg = gh.item->get_sub_group(); + sycl::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; @@ -226,7 +226,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) T x = x0; // Scan within a warp for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(x, i); + T s = sycl::shift_group_right(sg, x, i); if (lane >= i) x += s; } @@ -244,7 +244,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) if (warp == 0) { T y = (lane < nwarps) ? shared[lane] : 0; for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(y, i); + T s = sycl::shift_group_right(sg, y, i); if (lane >= i) y += s; } @@ -277,7 +277,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) amrex::launch(1, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - amrex::oneapi::sub_group const& sg = gh.item->get_sub_group(); + sycl::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; @@ -293,7 +293,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) T x = (offset < nblocks) ? blocksum_p[offset] : 0; // Scan within a warp for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(x, i); + T s = sycl::shift_group_right(sg, x, i); if (lane >= i) x += s; } @@ -311,7 +311,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) if (warp == 0) { T y = (lane < nwarps) ? shared[lane] : 0; for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(y, i); + T s = sycl::shift_group_right(sg, y, i); if (lane >= i) y += s; } @@ -417,7 +417,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum amrex::launch(nblocks, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - amrex::oneapi::sub_group const& sg = gh.item->get_sub_group(); + sycl::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; @@ -472,7 +472,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum T x = x0; // Scan within a warp for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(x, i); + T s = sycl::shift_group_right(sg, x, i); if (lane >= i) x += s; } @@ -490,7 +490,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum if (warp == 0) { T y = (lane < nwarps) ? shared[lane] : 0; for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(y, i); + T s = sycl::shift_group_right(sg, y, i); if (lane >= i) y += s; } @@ -543,7 +543,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum // implement our own __ballot unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0; for (int i = 1; i < Gpu::Device::warp_size; i *= 2) { - status_bf |= sg.shuffle_xor(status_bf, i); + status_bf |= sycl::permute_group_by_xor(sg, status_bf, i); } bool stop_lookback = status_bf & 0x1u; @@ -563,7 +563,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum } for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) { - x += sg.shuffle_down(x,i); + x += sycl::shift_group_left(sg, x,i); } }