[OpenMP 4.5] Remove superfluous map clauses

Fixes #41. OpenMP 4.5 does not require the map clauses on the target region if the data has been previously defined using unstructured data enter/exit clauses. Removing these clauses works fine with the Clang compiler; however, we noticed issues with the Cray compiler. The issue is that, with the Cray compiler, the target region "kernel calls" do not block, and so the timing is incorrect. This was not noticed before due to the presence of the map clauses. For now, we have had to add an "update from" clause of a scalar value to ensure that the kernel blocks. It is hoped that we can remove this in due course. In the vein of showing how the models work, we want to keep the minimum required correct code (which is without the map clauses), but we also need the code to work correctly.
UoB-HPC · Feb 7, 2018 · 73f1220 · 73f1220
1 parent cfb8901
commit 73f1220
Showing 1 changed file with 30 additions and 5 deletions.
diff --git a/OMPStream.cpp b/OMPStream.cpp
@@ -58,7 +58,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
@@ -68,6 +68,11 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
     b[i] = initB;
     c[i] = initC;
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -97,14 +102,19 @@ void OMPStream<T>::copy()
   unsigned int array_size = this->array_size;
   T *a = this->a;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
   for (int i = 0; i < array_size; i++)
   {
     c[i] = a[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -116,14 +126,19 @@ void OMPStream<T>::mul()
   unsigned int array_size = this->array_size;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
   for (int i = 0; i < array_size; i++)
   {
     b[i] = scalar * c[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(c[0:0])
+  #endif
 }
 
 template <class T>
@@ -134,14 +149,19 @@ void OMPStream<T>::add()
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
   for (int i = 0; i < array_size; i++)
   {
     c[i] = a[i] + b[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>
@@ -154,14 +174,19 @@ void OMPStream<T>::triad()
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
   for (int i = 0; i < array_size; i++)
   {
     a[i] = b[i] + scalar * c[i];
   }
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  // If using the Cray compiler, the kernels do not block, so this update forces
+  // a small copy to ensure blocking so that timing is correct
+  #pragma omp target update from(a[0:0])
+  #endif
 }
 
 template <class T>