From d295f2299101705f7c470c813b80542296087328 Mon Sep 17 00:00:00 2001
From: Nuno Miguel Nobre <nuno.nobre@stfc.ac.uk>
Date: Thu, 11 Aug 2022 03:40:09 +0100
Subject: [PATCH] [SYCL] Remove amrex::oneapi and update deprecated device
 descriptors (#2910)

* Remove amrex::oneapi in favour of standard features

* Change deprecated device descriptors
---
 Src/Base/AMReX_GpuDevice.cpp   |  4 ++--
 Src/Base/AMReX_GpuQualifiers.H |  4 ----
 Src/Base/AMReX_GpuReduce.H     |  8 ++++----
 Src/Base/AMReX_Scan.H          | 22 +++++++++++-----------
 4 files changed, 17 insertions(+), 21 deletions(-)
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index c0e9b3e6785..e0ab64b76e3 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -463,8 +463,8 @@ Device::initialize_gpu ()
         device_prop.warpSize = warp_size;
         auto sgss = d.get_info<sycl::info::device::sub_group_sizes>();
         device_prop.maxMemAllocSize = d.get_info<sycl::info::device::max_mem_alloc_size>();
-        device_prop.managedMemory = d.get_info<sycl::info::device::host_unified_memory>();
-        device_prop.concurrentManagedAccess = d.get_info<sycl::info::device::usm_shared_allocations>();
+        device_prop.managedMemory = d.has(sycl::aspect::usm_host_allocations);
+        device_prop.concurrentManagedAccess = d.has(sycl::aspect::usm_shared_allocations);
         device_prop.maxParameterSize = d.get_info<sycl::info::device::max_parameter_size>();
         {
             amrex::Print() << "Device Properties:\n"
diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H
index ce07a3e52c2..b5d5ea58fbd 100644
--- a/Src/Base/AMReX_GpuQualifiers.H
+++ b/Src/Base/AMReX_GpuQualifiers.H
@@ -41,10 +41,6 @@
 
 # include <CL/sycl.hpp>
 
-namespace amrex {
-    namespace oneapi = sycl::ext::oneapi;
-}
-
 # define AMREX_REQUIRE_SUBGROUP_SIZE(x) \
   _Pragma("clang diagnostic push") \
   _Pragma("clang diagnostic ignored \"-Wattributes\"") \
diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H
index 3907ca385f6..7b9b0e42355 100644
--- a/Src/Base/AMReX_GpuReduce.H
+++ b/Src/Base/AMReX_GpuReduce.H
@@ -55,10 +55,10 @@ template <int warpSize, typename T, typename F>
 struct warpReduce
 {
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    T operator() (T x, amrex::oneapi::sub_group const& sg) const noexcept
+    T operator() (T x, sycl::sub_group const& sg) const noexcept
     {
         for (int offset = warpSize/2; offset > 0; offset /= 2) {
-            T y = sg.shuffle_down(x, offset);
+            T y = sycl::shift_group_left(sg, x, offset);
             x = F()(x,y);
         }
         return x;
@@ -71,7 +71,7 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
 {
     T* shared = (T*)h.local;
     int tid = h.item->get_local_id(0);
-    amrex::oneapi::sub_group const& sg = h.item->get_sub_group();
+    sycl::sub_group const& sg = h.item->get_sub_group();
     int lane = sg.get_local_id()[0];
     int wid = sg.get_group_id()[0];
     int numwarps = sg.get_group_range()[0];
@@ -94,7 +94,7 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                           Gpu::Handler const& handler)
 {
-   amrex::oneapi::sub_group const& sg = handler.item->get_sub_group();
+   sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // full warp
diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H
index 96aefb870b6..3dc5cb98f9a 100644
--- a/Src/Base/AMReX_Scan.H
+++ b/Src/Base/AMReX_Scan.H
@@ -197,7 +197,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum)
     amrex::launch(nblocks, nthreads, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
-        amrex::oneapi::sub_group const& sg = gh.item->get_sub_group();
+        sycl::sub_group const& sg = gh.item->get_sub_group();
         int lane = sg.get_local_id()[0];
         int warp = sg.get_group_id()[0];
         int nwarps = sg.get_group_range()[0];
@@ -226,7 +226,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum)
             T x = x0;
             // Scan within a warp
             for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
-                T s = sg.shuffle_up(x, i);
+                T s = sycl::shift_group_right(sg, x, i);
                 if (lane >= i) x += s;
             }
 
@@ -244,7 +244,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum)
             if (warp == 0) {
                 T y = (lane < nwarps) ? shared[lane] : 0;
                 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
-                    T s = sg.shuffle_up(y, i);
+                    T s = sycl::shift_group_right(sg, y, i);
                     if (lane >= i) y += s;
                 }
 
@@ -277,7 +277,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum)
     amrex::launch(1, nthreads, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
-        amrex::oneapi::sub_group const& sg = gh.item->get_sub_group();
+        sycl::sub_group const& sg = gh.item->get_sub_group();
         int lane = sg.get_local_id()[0];
         int warp = sg.get_group_id()[0];
         int nwarps = sg.get_group_range()[0];
@@ -293,7 +293,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum)
             T x = (offset < nblocks) ? blocksum_p[offset] : 0;
             // Scan within a warp
             for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
-                T s = sg.shuffle_up(x, i);
+                T s = sycl::shift_group_right(sg, x, i);
                 if (lane >= i) x += s;
             }
 
@@ -311,7 +311,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum)
             if (warp == 0) {
                 T y = (lane < nwarps) ? shared[lane] : 0;
                 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
-                    T s = sg.shuffle_up(y, i);
+                    T s = sycl::shift_group_right(sg, y, i);
                     if (lane >= i) y += s;
                 }
 
@@ -417,7 +417,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
     amrex::launch(nblocks, nthreads, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
-        amrex::oneapi::sub_group const& sg = gh.item->get_sub_group();
+        sycl::sub_group const& sg = gh.item->get_sub_group();
         int lane = sg.get_local_id()[0];
         int warp = sg.get_group_id()[0];
         int nwarps = sg.get_group_range()[0];
@@ -472,7 +472,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
             T x = x0;
             // Scan within a warp
             for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
-                T s = sg.shuffle_up(x, i);
+                T s = sycl::shift_group_right(sg, x, i);
                 if (lane >= i) x += s;
             }
 
@@ -490,7 +490,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
             if (warp == 0) {
                 T y = (lane < nwarps) ? shared[lane] : 0;
                 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
-                    T s = sg.shuffle_up(y, i);
+                    T s = sycl::shift_group_right(sg, y, i);
                     if (lane >= i) y += s;
                 }
 
@@ -543,7 +543,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
                     // implement our own __ballot
                     unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
                     for (int i = 1; i < Gpu::Device::warp_size; i *= 2) {
-                        status_bf |= sg.shuffle_xor(status_bf, i);
+                        status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);
                     }
 
                     bool stop_lookback = status_bf & 0x1u;
@@ -563,7 +563,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
                         }
 
                         for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
-                            x += sg.shuffle_down(x,i);
+                            x += sycl::shift_group_left(sg, x,i);
                         }
                     }