From f3411209c9306b00afbee99a4b99c3c35074a997 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 5 Nov 2017 15:27:55 -0800
Subject: [PATCH 001/245] fix bug when iterations are odd

---
 RUST/transpose.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/RUST/transpose.rs b/RUST/transpose.rs
index a315a84ef..5d1ba1e87 100644
--- a/RUST/transpose.rs
+++ b/RUST/transpose.rs
@@ -154,19 +154,19 @@ fn main()
   let t1 = timer.elapsed();
   let dt = (t1.checked_sub(t0)).unwrap();
   let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64;
-  let transpose_time : f64 = dtt as f64 / 1.0e9_f64 as f64;
+  let transpose_time : f64 = dtt as f64 * 1.0e-9;
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  let addit : usize = (iterations as usize + 1) * (iterations as usize / 2);
+  let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2;
   let mut abserr : f64 = 0.0;
   for i in 0..order {
     for j in 0..order {
       let ij = i*order+j;
       let ji = j*order+i;
-      let reference : f64 = (ij*(1+iterations as usize)+addit) as f64;
+      let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64;
       abserr += (b[ji] - reference).abs();
     }
   }

From 4c8025dc0ff757630621b4a6d34843bcf69f0fce Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 3 Jan 2018 12:23:30 -0800
Subject: [PATCH 002/245] initial attempt at OCCA port

---
 Cxx11/nstream-occa.cc | 178 ++++++++++++++++++++++++++++++++++++++++++
 Cxx11/nstream.okl     |   7 ++
 2 files changed, 185 insertions(+)
 create mode 100644 Cxx11/nstream-occa.cc
 create mode 100644 Cxx11/nstream.okl

diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc
new file mode 100644
index 000000000..fb85c7b91
--- /dev/null
+++ b/Cxx11/nstream-occa.cc
@@ -0,0 +1,178 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/OCCA STREAM triad: A = B + scalar * C" << std::endl;
+
+  occa::device device("mode: 'Serial'");
+  //occa::device device("mode: 'OpenMP'");
+  //occa::device device("mode: 'OpenCL'");
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length> [<offset>]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<double> h_A;
+  std::vector<double> h_B;
+  std::vector<double> h_C;
+  h_A.resize(length,0.0);
+  h_B.resize(length,2.0);
+  h_C.resize(length,2.0);
+
+  // hard-coded in nstream.okl
+  const double scalar(3);
+
+  occa::memory d_A = device.malloc(length * sizeof(float), h_A);
+  occa::memory d_B = device.malloc(length * sizeof(float), h_B);
+  occa::memory d_C = device.malloc(length * sizeof(float), h_C);
+
+  occa::kernel nstream = device.buildKernel("nstream.okl", "nstream");
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+      if (iter==1) nstream_time = prk::wtime();
+      nstream(length, d_A, d_B, d_C);
+      device.finish();
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+  occa::memcpy(h_C, d_C);
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  double ref(0);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (auto i=0; i<length; i++) {
+      asum += std::fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/nstream.okl b/Cxx11/nstream.okl
new file mode 100644
index 000000000..1539b7a5c
--- /dev/null
+++ b/Cxx11/nstream.okl
@@ -0,0 +1,7 @@
+@kernel void nstream(int N, double * A, const double * B, const double * C) {
+  for (int group = 0; group < N; group += 64; outer) {
+    for (int i = group; i < (group + 64); ++i; inner) {
+      A[i] += B[i] + 3 * C[i];
+    }
+  }
+}

From 61882b9965694952628947326ef4884bde90ea1f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 3 Jan 2018 16:25:31 -0800
Subject: [PATCH 003/245] OCCA nstream working

---
 Cxx11/Makefile        | 13 +++++++++-
 Cxx11/nstream-occa.cc | 55 ++++++++++++++++++++++++++++---------------
 Cxx11/nstream.okl     |  6 +++--
 Cxx11/prk_util.h      |  4 ++++
 4 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 86ccdfb6c..e4345d87b 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -39,7 +39,7 @@ OPENCLFLAGS = $(OPENCLFLAG)
 # We do not yet handle all possible exceptions...
 #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS
 ORNLACCFLAGS = $(ORNLACCFLAG)
-TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
+TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
 BOOSTFLAGS = $(BOOSTFLAG)
 STLFLAGS = $(STLFLAG) $(BOOSTFLAGS)
@@ -47,6 +47,11 @@ PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS)
 RAJAFLAGS = $(RAJAFLAG)
 KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS)
 
+ifdef OCCADIR
+  include ${OCCADIR}/scripts/makefile
+endif
+OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath=${OCCADIR}/lib -L${OCCADIR}/lib -locca
+
 .PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl rangefor kokkos raja cuda
 
 EXTRA=
@@ -104,6 +109,8 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r
 
 cuda: transpose-cuda transpose-cublas nstream-vector-cuda
 
+occa: nstream-occa
+
 p2p-innerloop-vector: p2p-innerloop-vector-openmp.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
 
@@ -152,6 +159,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 %-cblas: %-cblas.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(CBLASFLAGS) -o $@
 
+%-occa: %-occa.cc prk_util.h
+	$(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@
+
 %: %.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
 
@@ -183,6 +193,7 @@ clean:
 	-rm -f *-cuda
 	-rm -f *-cublas
 	-rm -f *-cblas
+	-rm -f *-occa
 	-rm -f transpose-vector-async transpose-vector-thread
 
 cleancl:
diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc
index fb85c7b91..4e09578fd 100644
--- a/Cxx11/nstream-occa.cc
+++ b/Cxx11/nstream-occa.cc
@@ -68,16 +68,16 @@ int main(int argc, char * argv[])
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
   std::cout << "C++11/OCCA STREAM triad: A = B + scalar * C" << std::endl;
 
-  occa::device device("mode: 'Serial'");
-  //occa::device device("mode: 'OpenMP'");
-  //occa::device device("mode: 'OpenCL'");
+  occa::device device("mode = Serial");
+  //occa::device device("mode = OpenMP");
+  //occa::device device("mode = OpenCL, platformID = 0, deviceID = 0");
 
   //////////////////////////////////////////////////////////////////////
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
   int iterations, offset;
-  size_t length;
+  int length;
   try {
       if (argc < 3) {
         throw "Usage: <# iterations> <vector length> [<offset>]";
@@ -88,7 +88,7 @@ int main(int argc, char * argv[])
         throw "ERROR: iterations must be >= 1";
       }
 
-      length = std::atol(argv[2]);
+      length = std::atoi(argv[2]);
       if (length <= 0) {
         throw "ERROR: vector length must be positive";
       }
@@ -113,30 +113,43 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> h_A;
-  std::vector<double> h_B;
-  std::vector<double> h_C;
-  h_A.resize(length,0.0);
-  h_B.resize(length,2.0);
-  h_C.resize(length,2.0);
+  double * h_A = new double[length];
+  double * h_B = new double[length];
+  double * h_C = new double[length];
+  for (size_t i=0; i<length; ++i) {
+      h_A[i] = 0.0;
+      h_B[i] = 2.0;
+      h_C[i] = 2.0;
+  }
+
+  double scalar(3);
 
-  // hard-coded in nstream.okl
-  const double scalar(3);
+  occa::memory d_A = device.malloc(length * sizeof(double), h_A);
+  occa::memory d_B = device.malloc(length * sizeof(double), h_B);
+  occa::memory d_C = device.malloc(length * sizeof(double), h_C);
 
-  occa::memory d_A = device.malloc(length * sizeof(float), h_A);
-  occa::memory d_B = device.malloc(length * sizeof(float), h_B);
-  occa::memory d_C = device.malloc(length * sizeof(float), h_C);
+  d_A.copyFrom(h_A);
+  d_B.copyFrom(h_B);
+  d_C.copyFrom(h_C);
 
   occa::kernel nstream = device.buildKernel("nstream.okl", "nstream");
+
   {
     for (auto iter = 0; iter<=iterations; iter++) {
       if (iter==1) nstream_time = prk::wtime();
-      nstream(length, d_A, d_B, d_C);
+      nstream(length, scalar, d_A, d_B, d_C);
       device.finish();
     }
     nstream_time = prk::wtime() - nstream_time;
   }
-  occa::memcpy(h_C, d_C);
+
+  d_A.copyTo(h_A);
+
+  d_A.free();
+  d_B.free();
+  d_C.free();
+  nstream.free();
+  device.free();
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
@@ -154,9 +167,13 @@ int main(int argc, char * argv[])
 
   double asum(0);
   for (auto i=0; i<length; i++) {
-      asum += std::fabs(A[i]);
+      asum += std::fabs(h_A[i]);
   }
 
+  delete[] h_A;
+  delete[] h_B;
+  delete[] h_C;
+
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
diff --git a/Cxx11/nstream.okl b/Cxx11/nstream.okl
index 1539b7a5c..fa561facf 100644
--- a/Cxx11/nstream.okl
+++ b/Cxx11/nstream.okl
@@ -1,7 +1,9 @@
-@kernel void nstream(int N, double * A, const double * B, const double * C) {
+@kernel void nstream(int N, double scalar, double * A, const double * B, const double * C) {
   for (int group = 0; group < N; group += 64; outer) {
     for (int i = group; i < (group + 64); ++i; inner) {
-      A[i] += B[i] + 3 * C[i];
+      if (i<N) {
+        A[i] += B[i] + scalar * C[i];
+      }
     }
   }
 }
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 13a42fa3c..e152e85fd 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -212,6 +212,10 @@ const T prk_reduce(I first, I last, T init) {
 # include "RAJA/RAJA.hpp"
 #endif
 
+#ifdef USE_OCCA
+# include "occa.hpp"
+#endif
+
 #define RESTRICT __restrict__
 
 namespace prk {

From e2d908de0735d5e9d7035c22fc231741627e441f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 3 Jan 2018 16:27:12 -0800
Subject: [PATCH 004/245] add OCCA to make.defs examples

---
 common/make.defs.gcc   | 6 +++++-
 common/make.defs.intel | 6 +++++-
 common/make.defs.llvm  | 6 +++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 174da2362..e4ccf911f 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -35,6 +35,10 @@ OPENCLFLAG=-framework OpenCL
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 #
+# OCCA
+#
+OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
 # Cilk
 #
 CILKFLAG=-fcilkplus
@@ -42,7 +46,7 @@ CILKFLAG=-fcilkplus
 # TBB
 #
 TBBDIR=/usr/local/Cellar/tbb/2018_U1
-TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 3157acead..d919113b5 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -35,13 +35,17 @@ OFFLOADFLAG=-qopenmp-offload=host
 OPENCLDIR=/etc/alternatives/opencl-intel-tools
 OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 #
+# OCCA
+#
+OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
 # Cilk
 #
 CILKFLAG=-intel-extensions # default
 #
 # TBB
 #
-TBBFLAG=-tbb
+TBBFLAG=-USE_TBB -tbb
 #
 # Parallel STL, Boost, etc.
 #
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index d8357cd6b..133967dc7 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -54,10 +54,14 @@ OPENCLFLAG=-framework OpenCL
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 #
+#
+# OCCA
+#
+OCCADIR=${HOME}/prk-repo/Cxx11/occa
 # TBB
 #
 TBBDIR=/usr/local/Cellar/tbb/2018_U1
-TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #

From cb4a3e005c887c9694c64f02ce508dee38aa67c2 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 3 Jan 2018 21:10:56 -0800
Subject: [PATCH 005/245] make OCCA nicer

---
 Cxx11/Makefile        |  8 ++++----
 Cxx11/nstream-occa.cc | 15 ++++++++++++---
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index e4345d87b..f37c830ce 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -20,15 +20,12 @@ ifeq ($(USE_PRK_TBB_PARTITIONER),simple)
     PRK_TBB_PARTITIONER=3
 endif
 ifndef PRK_TBB_PARTITIONER
-    $(info PRK help: Consider setting USE_PRK_TBB_PARTITIONER={static,affinity,simple} when invoking make)
     PRK_TBB_PARTITIONER=0
 endif
 
 # Valid choices are OpenMP, Threads, Serial, Cuda
 ifdef USE_PRK_KOKKOS_BACKEND
     KOKKOS_BACKEND_FLAG = -DPRK_KOKKOS_BACKEND=$(USE_PRK_KOKKOS_BACKEND)
-else
-    $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP)
 endif
 
 ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm
@@ -63,7 +60,7 @@ else
   EXTRA += target
 endif
 
-all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl $(EXTRA)
+all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl occa $(EXTRA)
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-innerloop-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl
@@ -133,6 +130,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(OMPFLAGS) -o $@
 
 %-tbb: %-tbb.cc prk_util.h
+	$(info PRK help: Consider setting USE_PRK_TBB_PARTITIONER={static,affinity,simple} when invoking make)
 	$(CXX) $(CXXFLAGS) $< $(TBBFLAGS) -o $@
 
 %-stl: %-pstl.cc prk_util.h
@@ -148,6 +146,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@
 
 %-kokkos: %-kokkos.cc prk_util.h
+	$(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP)
 	$(CXX) $(CXXFLAGS) $< $(KOKKOSFLAGS) -o $@
 
 %-cuda: %-cuda.cu prk_util.h prk_cuda.h
@@ -160,6 +159,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(CBLASFLAGS) -o $@
 
 %-occa: %-occa.cc prk_util.h
+	$(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.)
 	$(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@
 
 %: %.cc prk_util.h
diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc
index 4e09578fd..10b0b47fd 100644
--- a/Cxx11/nstream-occa.cc
+++ b/Cxx11/nstream-occa.cc
@@ -68,9 +68,17 @@ int main(int argc, char * argv[])
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
   std::cout << "C++11/OCCA STREAM triad: A = B + scalar * C" << std::endl;
 
-  occa::device device("mode = Serial");
-  //occa::device device("mode = OpenMP");
-  //occa::device device("mode = OpenCL, platformID = 0, deviceID = 0");
+  char* dc = std::getenv("OCCA_DEVICE");
+  if (dc==NULL) {
+      std::cout << "By default, OCCA executes in serial.\n";
+      std::cout << "Set OCCA_DEVICE as follows for parallel execution\n";
+      std::cout << " OCCA_DEVICE=\"mode = OpenMP\"\n";
+      std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 0, deviceID = 0\" (CPU)\n";
+      std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 1, deviceID = 0\" (GPU)\n";
+      std::cout << " OCCA_DEVICE=\"mode = CUDA', deviceID = 0\"\n";
+  }
+  std::string ds = (dc==NULL) ? "mode = Serial" : dc;
+  occa::device device(ds);
 
   //////////////////////////////////////////////////////////////////////
   /// Read and test input parameters
@@ -106,6 +114,7 @@ int main(int argc, char * argv[])
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Vector length        = " << length << std::endl;
   std::cout << "Offset               = " << offset << std::endl;
+  std::cout << "OCCA mode            = " << "\"" << ds << "\"" << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation

From 04307d75b7822c5baaca6939a2213d253b5eca27 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 3 Jan 2018 21:26:41 -0800
Subject: [PATCH 006/245] s/pragma simd/pragma vector/ for ICC [ci skip]

---
 C1z/prk_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/C1z/prk_util.h b/C1z/prk_util.h
index 0777d4863..24b94f9c7 100644
--- a/C1z/prk_util.h
+++ b/C1z/prk_util.h
@@ -113,7 +113,7 @@ int __cilkrts_get_nworkers(void);
 #endif
 
 #if defined(__INTEL_COMPILER)
-# define PRAGMA_SIMD PRAGMA(simd)
+# define PRAGMA_SIMD PRAGMA(vector)
 #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) )
 # define PRAGMA_SIMD PRAGMA(GCC ivdep)
 #elif defined(__clang__)

From 26c4b40d7a39174a2a95b85cf962935067bd681c Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 4 Jan 2018 15:07:56 -0800
Subject: [PATCH 007/245] cleanup source

---
 Cxx11/transpose-openmp-target.cc | 8 +++-----
 Cxx11/transpose-vector.cc        | 2 --
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/Cxx11/transpose-openmp-target.cc b/Cxx11/transpose-openmp-target.cc
index c5b707ce5..a611997f5 100644
--- a/Cxx11/transpose-openmp-target.cc
+++ b/Cxx11/transpose-openmp-target.cc
@@ -71,14 +71,12 @@ int main(int argc, char * argv[])
         throw "Usage: <# iterations> <matrix order> [tile size]";
       }
 
-      // number of times to do the transpose
       iterations  = std::atoi(argv[1]);
       if (iterations < 1) {
         throw "ERROR: iterations must be >= 1";
       }
 
-      // order of a the matrix
-      order = std::atol(argv[2]);
+      order = std::atoi(argv[2]);
       if (order <= 0) {
         throw "ERROR: Matrix Order must be greater than 0";
       } else if (order > std::floor(std::sqrt(INT_MAX))) {
@@ -86,7 +84,7 @@ int main(int argc, char * argv[])
       }
 
       // default tile size for tiling of local transpose
-      tile_size = (argc>3) ? std::atol(argv[3]) : 32;
+      tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
       // a negative tile size means no tiling of the local transpose
       if (tile_size <= 0) tile_size = order;
   }
@@ -101,7 +99,7 @@ int main(int argc, char * argv[])
   std::cout << "Tile size             = " << tile_size << std::endl;
 
   //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
+  // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
   auto trans_time = 0.0;
diff --git a/Cxx11/transpose-vector.cc b/Cxx11/transpose-vector.cc
index 6dcf5dbe8..c6199ff40 100644
--- a/Cxx11/transpose-vector.cc
+++ b/Cxx11/transpose-vector.cc
@@ -71,13 +71,11 @@ int main(int argc, char * argv[])
         throw "Usage: <# iterations> <matrix order> [tile size]";
       }
 
-      // number of times to do the transpose
       iterations  = std::atoi(argv[1]);
       if (iterations < 1) {
         throw "ERROR: iterations must be >= 1";
       }
 
-      // order of a the matrix
       order = std::atoi(argv[2]);
       if (order <= 0) {
         throw "ERROR: Matrix Order must be greater than 0";

From d82afcace9d33e080f1d12837851121462e4ca95 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 4 Jan 2018 15:08:18 -0800
Subject: [PATCH 008/245] OCCA transpose

---
 Cxx11/Makefile          |   2 +-
 Cxx11/transpose-occa.cc | 189 ++++++++++++++++++++++++++++++++++++++++
 Cxx11/transpose.okl     |  11 +++
 3 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 Cxx11/transpose-occa.cc
 create mode 100644 Cxx11/transpose.okl

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index f37c830ce..f91ade18b 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -106,7 +106,7 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r
 
 cuda: transpose-cuda transpose-cublas nstream-vector-cuda
 
-occa: nstream-occa
+occa: transpose-occa nstream-occa
 
 p2p-innerloop-vector: p2p-innerloop-vector-openmp.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
diff --git a/Cxx11/transpose-occa.cc b/Cxx11/transpose-occa.cc
new file mode 100644
index 000000000..5b05b73ce
--- /dev/null
+++ b/Cxx11/transpose-occa.cc
@@ -0,0 +1,189 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+///          Converted to C++11 by Jeff Hammond, January 2018.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/OCCA Matrix transpose: B = A^T" << std::endl;
+
+  char* dc = std::getenv("OCCA_DEVICE");
+  if (dc==NULL) {
+      std::cout << "By default, OCCA executes in serial.\n";
+      std::cout << "Set OCCA_DEVICE as follows for parallel execution\n";
+      std::cout << " OCCA_DEVICE=\"mode = OpenMP\"\n";
+      std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 0, deviceID = 0\" (CPU)\n";
+      std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 1, deviceID = 0\" (GPU)\n";
+      std::cout << " OCCA_DEVICE=\"mode = CUDA', deviceID = 0\"\n";
+  }
+  std::string ds = (dc==NULL) ? "mode = Serial" : dc;
+  occa::device device(ds);
+
+  //////////////////////////////////////////////////////////////////////
+  // Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int tile_size;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [tile size]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+      // a negative tile size means no tiling of the local transpose
+      if (tile_size <= 0) tile_size = order;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "OCCA mode            = " << "\"" << ds << "\"" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto trans_time = 0.0;
+
+  double * h_A = new double[order*order];
+  double * h_B = new double[order*order];
+  for (auto i=0;i<order; i++) {
+    for (auto j=0;j<order;j++) {
+      h_A[i*order+j] = static_cast<double>(i*order+j);
+      h_B[i*order+j] = 0.0;
+    }
+  }
+
+  occa::memory d_A = device.malloc(order * order * sizeof(double), h_A);
+  occa::memory d_B = device.malloc(order * order * sizeof(double), h_B);
+
+  d_A.copyFrom(h_A);
+  d_B.copyFrom(h_B);
+
+  occa::kernel transpose = device.buildKernel("transpose.okl", "transpose");
+
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+      if (iter==1) trans_time = prk::wtime();
+      transpose(order, d_A, d_B);
+      device.finish();
+    }
+    trans_time = prk::wtime() - trans_time;
+  }
+
+  d_B.copyTo(h_B);
+
+  d_A.free();
+  d_B.free();
+  transpose.free();
+  device.free();
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto addit = (iterations+1.) * (iterations/2.);
+  auto abserr = 0.0;
+  for (auto j=0; j<order; j++) {
+    for (auto i=0; i<order; i++) {
+      const int ij = i*order+j;
+      const int ji = j*order+i;
+      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(h_B[ji] - reference);
+    }
+  }
+
+  delete[] h_A;
+  delete[] h_B;
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = order * order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose.okl b/Cxx11/transpose.okl
new file mode 100644
index 000000000..1c8d651dc
--- /dev/null
+++ b/Cxx11/transpose.okl
@@ -0,0 +1,11 @@
+@kernel void transpose(int N, double * A, double * B)
+{
+  for(int j = 0; j < N; ++j; outer) {
+    for(int i = 0; i < N; ++i; inner) {
+      if ((i<N) && (j<N)) {
+          B[i*N+j] += A[j*N+i];
+          A[j*N+i] += 1.0;
+      }
+    }
+  }
+}

From 91e468c687773a486b46c0086ff23eb8facbe182 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 4 Jan 2018 16:03:46 -0800
Subject: [PATCH 009/245] fix type in -DUSE_TBB

---
 common/make.defs.gcc   | 2 +-
 common/make.defs.intel | 2 +-
 common/make.defs.llvm  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index e4ccf911f..5e7d18986 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -46,7 +46,7 @@ CILKFLAG=-fcilkplus
 # TBB
 #
 TBBDIR=/usr/local/Cellar/tbb/2018_U1
-TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index d919113b5..0c21efc09 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -45,7 +45,7 @@ CILKFLAG=-intel-extensions # default
 #
 # TBB
 #
-TBBFLAG=-USE_TBB -tbb
+TBBFLAG=-DUSE_TBB -tbb
 #
 # Parallel STL, Boost, etc.
 #
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 133967dc7..b4837451d 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -61,7 +61,7 @@ OCCADIR=${HOME}/prk-repo/Cxx11/occa
 # TBB
 #
 TBBDIR=/usr/local/Cellar/tbb/2018_U1
-TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #

From e604a1384ba526a6fb08ae04b0b04eaf0add5e3e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 9 Jan 2018 15:52:07 -0800
Subject: [PATCH 010/245] fix Travis - USE_TBB flag

---
 travis/build-run-prk.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 74077e0a7..0a7be419c 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -453,12 +453,12 @@ case "$PRK_TARGET" in
                 Linux)
                     ${CC} --version
                     export TBBFLAG="-I${TBBROOT}/include -L${TBBROOT}/lib/intel64/gcc4.7 -ltbb"
-                    echo "TBBFLAG=${TBBFLAG}" >> common/make.defs
+                    echo "TBBFLAG=-DUSE_TBB ${TBBFLAG}" >> common/make.defs
                     export LD_LIBRARY_PATH=${TBBROOT}/lib/intel64/gcc4.7:${LD_LIBRARY_PATH}
                     ;;
                 Darwin)
                     export TBBFLAG="-I${TBBROOT}/include -L${TBBROOT}/lib -ltbb"
-                    echo "TBBFLAG=${TBBFLAG}" >> common/make.defs
+                    echo "TBBFLAG=-DUSE_TBB ${TBBFLAG}" >> common/make.defs
                     export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH}
                     ;;
             esac

From 51599c33e78f391bab4a99e448c081d44971a65a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 10 Jan 2018 09:04:30 -0800
Subject: [PATCH 011/245] add OCCA to Travis

---
 travis/build-run-prk.sh |  7 ++++++
 travis/install-deps.sh  |  1 +
 travis/install-occa.sh  | 51 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 travis/install-occa.sh

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 0a7be419c..b76414933 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -573,6 +573,13 @@ case "$PRK_TARGET" in
                 $PRK_TARGET_PATH/stencil-kokkos 10 200 20 $s $r
             done
         done
+
+        # C++ w/ OCCA
+        echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs
+        export OCCA_CXX=${PRK_CXX}
+        make -C $PRK_TARGET_PATH transpose-occa nstream-occa
+        $PRK_TARGET_PATH/transpose-occa   10 1024 32
+        $PRK_TARGET_PATH/nstream-occa     10 16777216 32
         ;;
     allfortran)
         echo "Fortran"
diff --git a/travis/install-deps.sh b/travis/install-deps.sh
index d844e1a8f..35e926c76 100755
--- a/travis/install-deps.sh
+++ b/travis/install-deps.sh
@@ -69,6 +69,7 @@ case "$PRK_TARGET" in
         sh ./travis/install-cmake.sh $TRAVIS_ROOT
         sh ./travis/install-raja.sh $TRAVIS_ROOT
         sh ./travis/install-kokkos.sh $TRAVIS_ROOT
+        sh ./travis/install-occa.sh $TRAVIS_ROOT
         ;;
     allfortran)
         echo "Fortran"
diff --git a/travis/install-occa.sh b/travis/install-occa.sh
new file mode 100644
index 000000000..3152577e3
--- /dev/null
+++ b/travis/install-occa.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+set -e
+set -x
+
+TRAVIS_ROOT="$1"
+
+case $CXX in
+    g++)
+        for major in "-9" "-8" "-7" "-6" "-5" "" ; do
+          if [ -f "`which ${CXX}${major}`" ]; then
+              export PRK_CXX="${CXX}${major}"
+              export PRK_CC="${CC}${major}"
+              echo "Found C++: $PRK_CXX"
+              break
+          fi
+        done
+        if [ "x$PRK_CXX" = "x" ] ; then
+            export PRK_CXX="${CXX}"
+            export PRK_CC="${CC}"
+        fi
+        ;;
+    clang++)
+        for version in "-7" "-6" "-5" "-4" "-3.9" "-3.8" "-3.7" "-3.6" "" ; do
+          if [ -f "`which ${CXX}${version}`" ]; then
+              export PRK_CXX="${CXX}${version}"
+              export PRK_CC="${CC}${version}"
+              echo "Found C++: $PRK_CXX"
+              break
+          fi
+        done
+        if [ "x$PRK_CXX" = "x" ] ; then
+            export PRK_CXX="${CXX}"
+            export PRK_CC="${CC}"
+        fi
+        ;;
+esac
+${PRK_CXX} -v
+
+if [ ! -d "$TRAVIS_ROOT/occa" ]; then
+    pushd
+    cd $TRAVIS_ROOT
+    BRANCH=develop
+    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git
+    cd occa
+    CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile
+    popd
+else
+    echo "OCCA installed..."
+    find $TRAVIS_ROOT/occa -name occa.hpp
+fi

From 4aba625eb8dac2445137409f4c2669ab45da4d39 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 10 Jan 2018 11:29:14 -0800
Subject: [PATCH 012/245] simplify OCCA build

---
 travis/install-occa.sh | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/travis/install-occa.sh b/travis/install-occa.sh
index 3152577e3..7462719e4 100644
--- a/travis/install-occa.sh
+++ b/travis/install-occa.sh
@@ -38,13 +38,9 @@ esac
 ${PRK_CXX} -v
 
 if [ ! -d "$TRAVIS_ROOT/occa" ]; then
-    pushd
-    cd $TRAVIS_ROOT
     BRANCH=develop
-    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git
-    cd occa
-    CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile
-    popd
+    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa
+    CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa
 else
     echo "OCCA installed..."
     find $TRAVIS_ROOT/occa -name occa.hpp

From 6384587fa5a42f802f0387042fe43a5151d1c6c7 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 14:04:49 -0800
Subject: [PATCH 013/245] use size_t instead of int

---
 Cxx11/nstream-vector-pstl.cc | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index e1fb1ce05..ae67ea494 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -132,14 +132,15 @@ int main(int argc, char * argv[])
     std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
-    __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) {
+    __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) {
 #else
-    std::for_each( std::begin(range), std::end(range), [&] (int i) {
+    std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
 #endif
-          A[i] = 0.0;
-          B[i] = 2.0;
-          C[i] = 2.0;
-      });
+        A[i] = 0;
+        B[i] = 2;
+        C[i] = 2;
+    });
+
     for (auto iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) nstream_time = prk::wtime();
@@ -148,9 +149,9 @@ int main(int argc, char * argv[])
       std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
-      __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) {
+      __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) {
 #else
-      std::for_each( std::begin(range), std::end(range), [&] (int i) {
+      std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
 #endif
           A[i] += B[i] + scalar * C[i];
       });

From 3b4f5e6bcb034a9034825f1817e938a8bbb29ba6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 14:09:15 -0800
Subject: [PATCH 014/245] add SYCL to examples

---
 common/make.defs.gcc   | 2 ++
 common/make.defs.intel | 2 ++
 common/make.defs.llvm  | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 4073b48a9..24da56216 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -35,6 +35,8 @@ OPENCLFLAG=-framework OpenCL
 # Linux
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
 #
 # Cilk
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 4fec2a33a..5e2eea3ea 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -35,6 +35,8 @@ OFFLOADFLAG=-qopenmp-offload=host
 # Linux
 OPENCLDIR=/etc/alternatives/opencl-intel-tools
 OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
 #
 # Cilk
 #
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index d8357cd6b..f029e1ceb 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -53,6 +53,8 @@ OPENCLFLAG=-framework OpenCL
 # Linux
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
 #
 # TBB
 #

From 010e11feace5914debecf319f025bf0d99c89b1a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 14:10:07 -0800
Subject: [PATCH 015/245] add SYCL nstream

---
 Cxx11/Makefile        |   9 +-
 Cxx11/nstream-sycl.cc | 204 ++++++++++++++++++++++++++++++++++++++++++
 Cxx11/prk_util.h      |   4 +
 3 files changed, 216 insertions(+), 1 deletion(-)
 create mode 100644 Cxx11/nstream-sycl.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 093702a48..04e7bae91 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -38,6 +38,7 @@ TARGETFLAGS = $(OFFLOADFLAG)
 OPENCLFLAGS = $(OPENCLFLAG)
 # We do not yet handle all possible exceptions...
 #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS
+SYCLFLAGS = -I$(SYCLDIR)/include -DUSE_SYCL $(BOOSTFLAG)
 ORNLACCFLAGS = $(ORNLACCFLAG)
 TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
@@ -58,7 +59,7 @@ else
   EXTRA += target
 endif
 
-all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl $(EXTRA)
+all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl $(EXTRA)
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-innerloop-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl
@@ -90,6 +91,8 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
+sycl: nstream-sycl
+
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
 
 stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl
@@ -118,6 +121,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 %-opencl: %-opencl.cc prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
 
+%-sycl: %-sycl.cc prk_util.h
+	$(CXX) $(CXXFLAGS) $< $(SYCLFLAGS) -o $@
+
 %-target: %-target.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) -o $@
 
@@ -176,6 +182,7 @@ clean:
 	-rm -f *-target
 	-rm -f *-taskloop
 	-rm -f *-opencl
+	-rm -f *-sycl
 	-rm -f *-tbb
 	-rm -f *-stl
 	-rm -f *-pstl
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
new file mode 100644
index 000000000..cf9f49373
--- /dev/null
+++ b/Cxx11/nstream-sycl.cc
@@ -0,0 +1,204 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+// See ParallelSTL.md for important information.
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  // SYCL device queue
+  cl::sycl::queue q;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<double> h_A;
+  std::vector<double> h_B;
+  std::vector<double> h_C;
+  h_A.resize(length);
+  h_B.resize(length);
+  h_C.resize(length);
+
+  auto range = boost::irange(static_cast<size_t>(0), length);
+
+  const double scalar(3);
+
+  std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
+      h_A[i] = 0;
+      h_B[i] = 2;
+      h_C[i] = 2;
+  });
+
+  {
+    // initialize device buffers from host buffers
+    cl::sycl::buffer<double> d_A { std::begin(h_A), std::end(h_A) };
+    cl::sycl::buffer<double> d_B { std::begin(h_B), std::end(h_B) };
+    cl::sycl::buffer<double> d_C { std::begin(h_C), std::end(h_C) };
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+   
+      if (iter==1) nstream_time = prk::wtime();
+   
+      q.submit([&](cl::sycl::handler& h) {
+
+        // accessor methods
+        auto A = d_A.get_access<cl::sycl::access::mode::read_write>(h);
+        auto B = d_B.get_access<cl::sycl::access::mode::read>(h);
+        auto C = d_C.get_access<cl::sycl::access::mode::read>(h);
+
+        h.parallel_for<class nothing>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+            A[i] += B[i] + scalar * C[i];
+        });
+      });
+      q.wait();
+    }
+
+    d_A.mark_as_written();
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    nstream_time = prk::wtime() - nstream_time;
+
+    d_A.set_final_data( h_A.begin() );
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += std::fabs(h_A[i]);
+  }
+
+  double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 9d42f3c1d..5804a1bf6 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -212,6 +212,10 @@ const T prk_reduce(I first, I last, T init) {
 # include "RAJA/RAJA.hpp"
 #endif
 
+#ifdef USE_SYCL
+# include "CL/sycl.hpp"
+#endif
+
 #define RESTRICT __restrict__
 
 namespace prk {

From 2e4dc458713e4e98598e7d3c5cf9b619daa7e1e5 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 15:00:27 -0800
Subject: [PATCH 016/245] fix a bunch of issues in SYCL nstream

- tested against ComputeCpp and triSYCL now.
- eliminate mark_as_written and set_final_data, which were not present
  in ComputeCpp (only triSYCL).  still figuring out what the right API
  is...
- use better device buffer constructor, i.e. one that is correct all of
  the time :-)
---
 Cxx11/Makefile        | 4 ++--
 Cxx11/nstream-sycl.cc | 9 +++------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 04e7bae91..5846d2693 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -38,7 +38,7 @@ TARGETFLAGS = $(OFFLOADFLAG)
 OPENCLFLAGS = $(OPENCLFLAG)
 # We do not yet handle all possible exceptions...
 #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS
-SYCLFLAGS = -I$(SYCLDIR)/include -DUSE_SYCL $(BOOSTFLAG)
+SYCLFLAGS = $(SYCLFLAG) $(BOOSTFLAG)
 ORNLACCFLAGS = $(ORNLACCFLAG)
 TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
@@ -122,7 +122,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
 
 %-sycl: %-sycl.cc prk_util.h
-	$(CXX) $(CXXFLAGS) $< $(SYCLFLAGS) -o $@
+	$(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@
 
 %-target: %-target.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) -o $@
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index cf9f49373..c2684a023 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -134,9 +134,9 @@ int main(int argc, char * argv[])
 
   {
     // initialize device buffers from host buffers
-    cl::sycl::buffer<double> d_A { std::begin(h_A), std::end(h_A) };
-    cl::sycl::buffer<double> d_B { std::begin(h_B), std::end(h_B) };
-    cl::sycl::buffer<double> d_C { std::begin(h_C), std::end(h_C) };
+    cl::sycl::buffer<double> d_A { h_A.data(), h_A.size() };
+    cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
+    cl::sycl::buffer<double> d_C { h_C.data(), h_C.size() };
 
     for (auto iter = 0; iter<=iterations; iter++) {
    
@@ -156,13 +156,10 @@ int main(int argc, char * argv[])
       q.wait();
     }
 
-    d_A.mark_as_written();
     // Stop timer before buffer+accessor destructors fire,
     // since that will move data, and we do not time that
     // for other device-oriented programming models.
     nstream_time = prk::wtime() - nstream_time;
-
-    d_A.set_final_data( h_A.begin() );
   }
 
   //////////////////////////////////////////////////////////////////////

From c8ee43331164c376e2e7ccc0ad9c47d8a36dc529 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 16:34:14 -0800
Subject: [PATCH 017/245] name kernel; cleanup

---
 Cxx11/nstream-sycl.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index c2684a023..306dc7038 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -64,8 +64,6 @@
 
 #include "prk_util.h"
 
-// See ParallelSTL.md for important information.
-
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
@@ -149,7 +147,7 @@ int main(int argc, char * argv[])
         auto B = d_B.get_access<cl::sycl::access::mode::read>(h);
         auto C = d_C.get_access<cl::sycl::access::mode::read>(h);
 
-        h.parallel_for<class nothing>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+        h.parallel_for<class nstream>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
             A[i] += B[i] + scalar * C[i];
         });
       });

From 954d504fda805a28576c390cbe0175076b881b0f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 16:36:09 -0800
Subject: [PATCH 018/245] cleanup SYCL in make.defs.* examples

---
 common/make.defs.gcc   |  6 ++++++
 common/make.defs.intel |  6 ++++++
 common/make.defs.llvm  | 15 ++++++++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 24da56216..f355695fe 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -35,8 +35,14 @@ OPENCLFLAG=-framework OpenCL
 # Linux
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+#
+# SYCL flags
+#
+# triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
+SYCLCXX=${CXX}
+SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 #
 # Cilk
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 5e2eea3ea..a31e8a6ae 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -35,8 +35,14 @@ OFFLOADFLAG=-qopenmp-offload=host
 # Linux
 OPENCLDIR=/etc/alternatives/opencl-intel-tools
 OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+#
+# SYCL flags
+#
+# triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
+SYCLCXX=${CXX}
+SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 #
 # Cilk
 #
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index f029e1ceb..ada326c58 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -53,8 +53,21 @@ OPENCLFLAG=-framework OpenCL
 # Linux
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+#
+# SYCL flags
+#
+# CodePlay ComputeCpp
+SYCLDIR=/opt/sycl/latest
+SYCLCXX=${SYCLDIR}/bin/compute++
+SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp
+# This makes a huge difference in e.g. nstream...
+SYCLFLAG+=-no-serial-memop
+#
+# triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
+#SYCLDIR=./triSYCL
+#SYCLCXX=${CXX}
+#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 #
 # TBB
 #

From 91eb8f4ffe40eb43cd5a2db5457ff03fdd21a89e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 12 Jan 2018 16:36:25 -0800
Subject: [PATCH 019/245] add SYCL transpose

---
 Cxx11/Makefile          |   2 +-
 Cxx11/transpose-sycl.cc | 186 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 Cxx11/transpose-sycl.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 5846d2693..ce8af6110 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -91,7 +91,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
-sycl: nstream-sycl
+sycl: nstream-sycl transpose-sycl
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
 
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
new file mode 100644
index 000000000..80ebb12b0
--- /dev/null
+++ b/Cxx11/transpose-sycl.cc
@@ -0,0 +1,186 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  // SYCL device queue
+  cl::sycl::queue q;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  std::vector<double> h_A;
+  std::vector<double> h_B;
+  h_A.resize(order*order);
+  h_B.resize(order*order,0.0);
+
+  // fill A with the sequence 0 to order^2-1 as doubles
+  std::iota(h_A.begin(), h_A.end(), 0.0);
+
+  auto range = boost::irange(static_cast<size_t>(0),order);
+
+  auto trans_time = 0.0;
+
+  {
+    // initialize device buffers from host buffers
+#if USE_2D_INDEXING
+    cl::sycl::buffer<double,2> d_A( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array
+    cl::sycl::buffer<double,2> d_B( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array
+#else
+    cl::sycl::buffer<double> d_A { h_A.data(), h_A.size() };
+    cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
+#endif
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+ 
+      if (iter==1) trans_time = prk::wtime();
+ 
+      q.submit([&](cl::sycl::handler& h) {
+
+        // accessor methods
+        auto A = d_A.get_access<cl::sycl::access::mode::read_write>(h);
+        auto B = d_B.get_access<cl::sycl::access::mode::read_write>(h);
+
+        // transpose
+        h.parallel_for<class transpose>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+#if USE_2D_INDEXING
+#error 2D indexing is not implemented yet.  Fix this!
+#else
+          B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
+          A[it[1] * order + it[0]] += 1.0;
+#endif
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    trans_time = prk::wtime() - trans_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // TODO: replace with std::generate, std::accumulate, or similar
+  const auto addit = (iterations+1.) * (iterations/2.);
+  auto abserr = 0.0;
+  for (auto i : range) {
+    for (auto j : range) {
+      const int ij = i*order+j;
+      const int ji = j*order+i;
+      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(h_B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+

From 20126c109d1270b8583bc671e3509de3a0e587fa Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 12 Jan 2018 17:50:36 -0800
Subject: [PATCH 020/245] fix occa install hopefully

---
 travis/install-occa.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/travis/install-occa.sh b/travis/install-occa.sh
index 7462719e4..8adb47346 100644
--- a/travis/install-occa.sh
+++ b/travis/install-occa.sh
@@ -39,7 +39,8 @@ ${PRK_CXX} -v
 
 if [ ! -d "$TRAVIS_ROOT/occa" ]; then
     BRANCH=develop
-    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa
+    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git
+    mv occa $TRAVIS_ROOT/occa
     CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa
 else
     echo "OCCA installed..."

From 6804a8ae610f02a4692eb5a5879f6bbd2a66f679 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 13 Jan 2018 14:24:13 -0800
Subject: [PATCH 021/245] fix Travis OCCA

---
 travis/install-occa.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/travis/install-occa.sh b/travis/install-occa.sh
index 8adb47346..887366219 100644
--- a/travis/install-occa.sh
+++ b/travis/install-occa.sh
@@ -38,9 +38,8 @@ esac
 ${PRK_CXX} -v
 
 if [ ! -d "$TRAVIS_ROOT/occa" ]; then
-    BRANCH=develop
-    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git
-    mv occa $TRAVIS_ROOT/occa
+    BRANCH="1.0"
+    git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa
     CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa
 else
     echo "OCCA installed..."

From e1c0822f2c5e83aa9bdf73734c9165fdf3617655 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 13 Jan 2018 16:13:59 -0800
Subject: [PATCH 022/245] add stencil-sycl despite bugs

stencil kernel is segfaulting due to out-of-bounds error (presumably)
---
 Cxx11/Makefile        |   2 +-
 Cxx11/stencil-sycl.cc | 263 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 Cxx11/stencil-sycl.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index ce8af6110..e91dae03f 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -91,7 +91,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
-sycl: nstream-sycl transpose-sycl
+sycl: stencil-sycl transpose-sycl nstream-sycl
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
 
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
new file mode 100644
index 000000000..b9708e0df
--- /dev/null
+++ b/Cxx11/stencil-sycl.cc
@@ -0,0 +1,263 @@
+
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "in" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t n;
+  int radius = 2;
+  bool star = true;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension>";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n  = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+#if 0
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+#endif
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+  std::cout << "Radius of stencil    = " << radius << std::endl;
+
+#if 0
+  auto stencil = nothing;
+  if (star) {
+      switch (radius) {
+          case 1: stencil = star1; break;
+          case 2: stencil = star2; break;
+          case 3: stencil = star3; break;
+          case 4: stencil = star4; break;
+          case 5: stencil = star5; break;
+      }
+  } else {
+      switch (radius) {
+          case 1: stencil = grid1; break;
+          case 2: stencil = grid2; break;
+          case 3: stencil = grid3; break;
+          case 4: stencil = grid4; break;
+          case 5: stencil = grid5; break;
+      }
+  }
+#endif
+
+  // SYCL device queue
+  cl::sycl::queue q;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto stencil_time = 0.0;
+
+  std::vector<double> h_in;
+  std::vector<double> h_out;
+  h_in.resize(n*n);
+  h_out.resize(n*n);
+
+  for (auto i=0; i<n; i++) {
+    for (auto j=0; j<n; j++) {
+      h_in[i*n+j] = static_cast<double>(i+j);
+      h_out[i*n+j] = 0.0;
+    }
+  }
+
+  {
+    // initialize device buffers from host buffers
+    //cl::sycl::buffer<double> d_in  { h_in.data(),  h_in.size() };
+    //cl::sycl::buffer<double> d_out { h_out.data(), h_out.size() };
+    cl::sycl::buffer<double, 2> d_in  { h_in.data() , cl::sycl::range<2> {n, n} };
+    cl::sycl::buffer<double, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+   
+      if (iter==1) stencil_time = prk::wtime();
+
+      q.submit([&](cl::sycl::handler& h) {
+
+        // accessor methods
+        auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+        auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+       
+#if 0
+        // Apply the stencil operator
+        h.parallel_for<class star2>(cl::sycl::range<2> {n-2, n-2}, cl::sycl::id<2> {2, 2},
+                                    [=] (cl::sycl::item<2> it) {
+            cl::sycl::id<2> xy = it.get_id();
+            cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+            cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+            cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+            cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+            out[xy] += +in[xy-dx2] * -0.125
+                       +in[xy-dx1] * -0.25
+                       +in[xy-dy2] * -0.125
+                       +in[xy-dx1] * -0.25
+                       +in[xy+dx1] *  0.25
+                       +in[xy+dx2] *  0.125
+                       +in[xy+dy1] *  0.25
+                       +in[xy+dx2] *  0.125;
+        });
+#endif
+
+        // Add constant to solution to force refresh of neighbor data, if any
+        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {1, 1},
+                                  [=] (cl::sycl::item<2> it) {
+            cl::sycl::id<2> xy = it.get_id();
+            out[xy] += 1.0;
+        });
+
+      });
+      q.wait();
+    }
+    stencil_time = prk::wtime() - stencil_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  // interior of grid with respect to stencil
+  size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
+
+  // compute L1 norm in parallel
+  double norm = 0.0;
+  for (auto i=radius; i<n-radius; i++) {
+    for (auto j=radius; j<n-radius; j++) {
+      norm += std::fabs(h_out[i*n+j]);
+    }
+  }
+  norm /= active_points;
+
+  // verify correctness
+  const double epsilon = 1.0e-8;
+  double reference_norm = 2.*(iterations+1.);
+  if (std::fabs(norm-reference_norm) > epsilon) {
+    std::cout << "ERROR: L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+    return 1;
+  } else {
+    std::cout << "Solution validates" << std::endl;
+#ifdef VERBOSE
+    std::cout << "L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+#endif
+    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+    size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
+    auto avgtime = stencil_time/iterations;
+    std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}

From d5a577ab14b0dbb15bda04857eb82e153565be37 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 13 Jan 2018 16:33:40 -0800
Subject: [PATCH 023/245] use unconstrained transpose

blocked transpose by NVIDIA does not work for all dimensions so we will
use a slower version that actually works for all dimensions.
---
 Cxx11/transpose-cuda.cu | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu
index 5c710942a..ddb22ca92 100644
--- a/Cxx11/transpose-cuda.cu
+++ b/Cxx11/transpose-cuda.cu
@@ -56,6 +56,7 @@
 #include "prk_util.h"
 #include "prk_cuda.h"
 
+#if TILED
 // The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu,
 // which is the reason for the additional copyright noted above.
 
@@ -73,6 +74,18 @@ __global__ void transpose(int order, prk_float * A, prk_float * B)
         A[(y+j)*width + x] += (prk_float)1;
     }
 }
+#else
+__global__ void transpose(unsigned order, prk_float * A, prk_float * B)
+{
+    auto i = blockIdx.x * blockDim.x + threadIdx.x;
+    auto j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if ((i<order) && (j<order)) {
+        B[i*order+j] += A[j*order+i];
+        A[j*order+i] += (prk_float)1;
+    }
+}
+#endif
 
 int main(int argc, char * argv[])
 {
@@ -105,10 +118,18 @@ int main(int argc, char * argv[])
         throw "ERROR: matrix dimension too large - overflow risk";
       }
 
+#if TILED
       if (order % tile_dim != 0) {
           std::cout << "Sorry, but order (" << order << ") must be evenly divible by " << tile_dim
                     << " or the results are going to be wrong.\n";
       }
+#endif
+#ifdef __CORIANDERCC__
+      // This has not been analyzed, but it is an empirical fact.
+      if (order > 1234) {
+          std::cout << "The results are probably going to be wrong, because order>1234.\n";
+      }
+#endif
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -118,8 +139,13 @@ int main(int argc, char * argv[])
   std::cout << "Matrix order          = " << order << std::endl;
   std::cout << "Number of iterations  = " << iterations << std::endl;
 
+#if TILED
   dim3 dimGrid(order/tile_dim, order/tile_dim, 1);
   dim3 dimBlock(tile_dim, block_rows, 1);
+#else
+  dim3 dimGrid(order, order, 1);
+  dim3 dimBlock(1, 1, 1);
+#endif
 
   info.checkDims(dimBlock, dimGrid);
 

From c25dc7281e64744423bb9bd64b8bfe1125c7a18a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 13 Jan 2018 17:00:46 -0800
Subject: [PATCH 024/245] debugged SYCL stencil

---
 Cxx11/stencil-sycl.cc | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index b9708e0df..4d3c83c32 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -190,33 +190,44 @@ int main(int argc, char* argv[])
         auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
         auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
        
-#if 0
         // Apply the stencil operator
-        h.parallel_for<class star2>(cl::sycl::range<2> {n-2, n-2}, cl::sycl::id<2> {2, 2},
+        h.parallel_for<class star2>(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2},
                                     [=] (cl::sycl::item<2> it) {
             cl::sycl::id<2> xy = it.get_id();
+#if 1
             cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
             cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
             cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
             cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-            out[xy] += +in[xy-dx2] * -0.125
-                       +in[xy-dx1] * -0.25
-                       +in[xy-dy2] * -0.125
+#endif
+            //printf("%zu,%zu\n",xy[0],xy[1]);
+            out[xy] += 0.0
+#if 1
                        +in[xy-dx1] * -0.25
                        +in[xy+dx1] *  0.25
-                       +in[xy+dx2] *  0.125
+                       +in[xy-dy1] * -0.25
                        +in[xy+dy1] *  0.25
-                       +in[xy+dx2] *  0.125;
-        });
+                       +in[xy-dx2] * -0.125
+                       +in[xy+dx2] *  0.125
+                       +in[xy-dy2] * -0.125
+                       +in[xy+dy2] *  0.125
 #endif
+                       ;
+        });
+      });
+
+      q.submit([&](cl::sycl::handler& h) {
 
+        // accessor methods
+        auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+        auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+       
         // Add constant to solution to force refresh of neighbor data, if any
-        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {1, 1},
+        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0},
                                   [=] (cl::sycl::item<2> it) {
             cl::sycl::id<2> xy = it.get_id();
-            out[xy] += 1.0;
+            in[xy] += 1.0;
         });
-
       });
       q.wait();
     }

From e0898dda89255081c8a8ec258a001875e5bb2a2b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 13 Jan 2018 20:25:48 -0800
Subject: [PATCH 025/245] refactor stencil-sycl

---
 Cxx11/stencil-sycl.cc | 93 +++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 43 deletions(-)

diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 4d3c83c32..552e11082 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -1,6 +1,6 @@
 
 ///
-/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2017, Intel Corporation
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions
@@ -61,6 +61,45 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "stencil_sycl.hpp"
+
+void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> d_in, cl::sycl::buffer<double, 2> d_out)
+{
+    std::cout << "You are trying to use a stencil that does not exist.\n";
+    std::cout << "Please generate the new stencil using the code generator\n";
+    std::cout << "and add it to the case-switch in the driver." << std::endl;
+    std::abort();
+}
+
+void star2(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> d_in,
+           cl::sycl::buffer<double, 2> d_out)
+{
+   q.submit([&](cl::sycl::handler& h) {
+
+     // accessor methods
+     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+
+     // Apply the stencil operator
+     h.parallel_for<class star2>(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2},
+                                 [=] (cl::sycl::item<2> it) {
+         cl::sycl::id<2> xy = it.get_id();
+         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+         cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+         cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+         cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+         out[xy] += +in[xy-dx1] * -0.25
+                    +in[xy+dx1] *  0.25
+                    +in[xy-dy1] * -0.25
+                    +in[xy+dy1] *  0.25
+                    +in[xy-dx2] * -0.125
+                    +in[xy+dx2] *  0.125
+                    +in[xy-dy2] * -0.125
+                    +in[xy+dy2] *  0.125;
+     });
+   });
+}
 
 int main(int argc, char* argv[])
 {
@@ -131,26 +170,24 @@ int main(int argc, char* argv[])
   std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
   std::cout << "Radius of stencil    = " << radius << std::endl;
 
-#if 0
   auto stencil = nothing;
   if (star) {
       switch (radius) {
-          case 1: stencil = star1; break;
+          //case 1: stencil = star1; break;
           case 2: stencil = star2; break;
-          case 3: stencil = star3; break;
-          case 4: stencil = star4; break;
-          case 5: stencil = star5; break;
+          //case 3: stencil = star3; break;
+          //case 4: stencil = star4; break;
+          //case 5: stencil = star5; break;
       }
   } else {
       switch (radius) {
-          case 1: stencil = grid1; break;
-          case 2: stencil = grid2; break;
-          case 3: stencil = grid3; break;
-          case 4: stencil = grid4; break;
-          case 5: stencil = grid5; break;
+          //case 1: stencil = grid1; break;
+          //case 2: stencil = grid2; break;
+          //case 3: stencil = grid3; break;
+          //case 4: stencil = grid4; break;
+          //case 5: stencil = grid5; break;
       }
   }
-#endif
 
   // SYCL device queue
   cl::sycl::queue q;
@@ -184,37 +221,7 @@ int main(int argc, char* argv[])
    
       if (iter==1) stencil_time = prk::wtime();
 
-      q.submit([&](cl::sycl::handler& h) {
-
-        // accessor methods
-        auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-        auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-       
-        // Apply the stencil operator
-        h.parallel_for<class star2>(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2},
-                                    [=] (cl::sycl::item<2> it) {
-            cl::sycl::id<2> xy = it.get_id();
-#if 1
-            cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-            cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-            cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-            cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-#endif
-            //printf("%zu,%zu\n",xy[0],xy[1]);
-            out[xy] += 0.0
-#if 1
-                       +in[xy-dx1] * -0.25
-                       +in[xy+dx1] *  0.25
-                       +in[xy-dy1] * -0.25
-                       +in[xy+dy1] *  0.25
-                       +in[xy-dx2] * -0.125
-                       +in[xy+dx2] *  0.125
-                       +in[xy-dy2] * -0.125
-                       +in[xy+dy2] *  0.125
-#endif
-                       ;
-        });
-      });
+      star2(q, n, d_in, d_out);
 
       q.submit([&](cl::sycl::handler& h) {
 

From 423edd4f3075e9f6434b17a2949121a70b324e85 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 13 Jan 2018 21:10:18 -0800
Subject: [PATCH 026/245] create and use SYCL code generator

- standalone code generator that improves in some ways on the cxx one.
- star2 behaves the same, other star stencils validate.
- did not implement grid, because it's still buggy in other impls
  anyways.  fix that bug (bookkeeping) first.
---
 Cxx11/generate-sycl-stencil.py |  78 +++++++++++++++++
 Cxx11/stencil-sycl.cc          |  16 ++--
 Cxx11/stencil_sycl.hpp         | 155 +++++++++++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 8 deletions(-)
 create mode 100755 Cxx11/generate-sycl-stencil.py
 create mode 100644 Cxx11/stencil_sycl.hpp

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
new file mode 100755
index 000000000..8a8f44ddb
--- /dev/null
+++ b/Cxx11/generate-sycl-stencil.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+import sys
+import fileinput
+import string
+import os
+
+def codegen(src,pattern,stencil_size,radius,W,model):
+    src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n,\n')
+    src.write('           cl::sycl::buffer<double, 2> d_in,\n')
+    src.write('           cl::sycl::buffer<double, 2> d_out) {\n')
+    src.write('  q.submit([&](cl::sycl::handler& h) {\n')
+    src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       \n')
+    src.write('    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);\n')
+    src.write('    h.parallel_for<class '+pattern+str(radius)+'>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
+    src.write('                                [=] (cl::sycl::item<2> it) {\n')
+    src.write('        cl::sycl::id<2> xy = it.get_id();\n')
+    for r in range(1,radius+1):
+        src.write('        cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
+        src.write('        cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
+    src.write('        out[xy] += ')
+    if pattern == 'star':
+        for i in range(1,radius+1):
+            if i > 1:
+                src.write('\n')
+                src.write(19*' ')
+            src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius)))
+            src.write('\n'+19*' ')
+            src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius)))
+            src.write('\n'+19*' ')
+            src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius)))
+            src.write('\n'+19*' ')
+            src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius)))
+            if i == radius:
+                src.write(';\n')
+    else:
+        print('grid not implemented\n')
+    src.write('    });\n')
+    src.write('  });\n')
+    src.write('}\n\n')
+
+def instance(src,model,pattern,r):
+
+    W = [[0.0e0 for x in range(2*r+1)] for x in range(2*r+1)]
+    if pattern == 'star':
+        stencil_size = 4*r+1
+        for i in range(1,r+1):
+            W[r][r+i] = +1./(2*i*r)
+            W[r+i][r] = +1./(2*i*r)
+            W[r][r-i] = -1./(2*i*r)
+            W[r-i][r] = -1./(2*i*r)
+
+    else:
+        stencil_size = (2*r+1)**2
+        for j in range(1,r+1):
+            for i in range(-j+1,j):
+                W[r+i][r+j] = +1./(4*j*(2*j-1)*r)
+                W[r+i][r-j] = -1./(4*j*(2*j-1)*r)
+                W[r+j][r+i] = +1./(4*j*(2*j-1)*r)
+                W[r-j][r+i] = -1./(4*j*(2*j-1)*r)
+
+            W[r+j][r+j]    = +1./(4*j*r)
+            W[r-j][r-j]    = -1./(4*j*r)
+
+    codegen(src,pattern,stencil_size,r,W,model)
+
+def main():
+    for model in ['sycl']:
+      src = open('stencil_'+model+'.hpp','w')
+      #for pattern in ['star','grid']:
+      for pattern in ['star']:
+        for r in range(1,6):
+          instance(src,model,pattern,r)
+      src.close()
+
+if __name__ == '__main__':
+    main()
+
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 552e11082..fcd193298 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -71,6 +71,7 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> d_
     std::abort();
 }
 
+#if 0
 void star2(cl::sycl::queue & q, const size_t n,
            cl::sycl::buffer<double, 2> d_in,
            cl::sycl::buffer<double, 2> d_out)
@@ -100,6 +101,7 @@ void star2(cl::sycl::queue & q, const size_t n,
      });
    });
 }
+#endif
 
 int main(int argc, char* argv[])
 {
@@ -141,6 +143,7 @@ int main(int argc, char* argv[])
           if (tile_size <= 0) tile_size = n;
           if (tile_size > n) tile_size = n;
       }
+#endif
 
       // stencil pattern
       if (argc > 4) {
@@ -158,7 +161,6 @@ int main(int argc, char* argv[])
       if ( (radius < 1) || (2*radius+1 > n) ) {
         throw "ERROR: Stencil radius negative or too large";
       }
-#endif
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -173,11 +175,11 @@ int main(int argc, char* argv[])
   auto stencil = nothing;
   if (star) {
       switch (radius) {
-          //case 1: stencil = star1; break;
+          case 1: stencil = star1; break;
           case 2: stencil = star2; break;
-          //case 3: stencil = star3; break;
-          //case 4: stencil = star4; break;
-          //case 5: stencil = star5; break;
+          case 3: stencil = star3; break;
+          case 4: stencil = star4; break;
+          case 5: stencil = star5; break;
       }
   } else {
       switch (radius) {
@@ -212,8 +214,6 @@ int main(int argc, char* argv[])
 
   {
     // initialize device buffers from host buffers
-    //cl::sycl::buffer<double> d_in  { h_in.data(),  h_in.size() };
-    //cl::sycl::buffer<double> d_out { h_out.data(), h_out.size() };
     cl::sycl::buffer<double, 2> d_in  { h_in.data() , cl::sycl::range<2> {n, n} };
     cl::sycl::buffer<double, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
 
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])
         auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
        
         // Add constant to solution to force refresh of neighbor data, if any
-        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0},
+        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0},
                                   [=] (cl::sycl::item<2> it) {
             cl::sycl::id<2> xy = it.get_id();
             in[xy] += 1.0;
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
new file mode 100644
index 000000000..845082e62
--- /dev/null
+++ b/Cxx11/stencil_sycl.hpp
@@ -0,0 +1,155 @@
+void star1(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> d_in,
+           cl::sycl::buffer<double, 2> d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star1>(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1},
+                                [=] (cl::sycl::item<2> it) {
+        cl::sycl::id<2> xy = it.get_id();
+        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+        out[xy] += +in[xy+dx1] * 0.5
+                   +in[xy+dy1] * 0.5
+                   +in[xy-dx1] * -0.5
+                   +in[xy-dy1] * -0.5;
+    });
+  });
+}
+
+void star2(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> d_in,
+           cl::sycl::buffer<double, 2> d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star2>(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2},
+                                [=] (cl::sycl::item<2> it) {
+        cl::sycl::id<2> xy = it.get_id();
+        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+        out[xy] += +in[xy+dx1] * 0.25
+                   +in[xy+dy1] * 0.25
+                   +in[xy-dx1] * -0.25
+                   +in[xy-dy1] * -0.25
+                   +in[xy+dx2] * 0.125
+                   +in[xy+dy2] * 0.125
+                   +in[xy-dx2] * -0.125
+                   +in[xy-dy2] * -0.125;
+    });
+  });
+}
+
+void star3(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> d_in,
+           cl::sycl::buffer<double, 2> d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star3>(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3},
+                                [=] (cl::sycl::item<2> it) {
+        cl::sycl::id<2> xy = it.get_id();
+        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+        cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
+        cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
+        out[xy] += +in[xy+dx1] * 0.16666666666666666
+                   +in[xy+dy1] * 0.16666666666666666
+                   +in[xy-dx1] * -0.16666666666666666
+                   +in[xy-dy1] * -0.16666666666666666
+                   +in[xy+dx2] * 0.08333333333333333
+                   +in[xy+dy2] * 0.08333333333333333
+                   +in[xy-dx2] * -0.08333333333333333
+                   +in[xy-dy2] * -0.08333333333333333
+                   +in[xy+dx3] * 0.05555555555555555
+                   +in[xy+dy3] * 0.05555555555555555
+                   +in[xy-dx3] * -0.05555555555555555
+                   +in[xy-dy3] * -0.05555555555555555;
+    });
+  });
+}
+
+void star4(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> d_in,
+           cl::sycl::buffer<double, 2> d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star4>(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4},
+                                [=] (cl::sycl::item<2> it) {
+        cl::sycl::id<2> xy = it.get_id();
+        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+        cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
+        cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
+        cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
+        cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
+        out[xy] += +in[xy+dx1] * 0.125
+                   +in[xy+dy1] * 0.125
+                   +in[xy-dx1] * -0.125
+                   +in[xy-dy1] * -0.125
+                   +in[xy+dx2] * 0.0625
+                   +in[xy+dy2] * 0.0625
+                   +in[xy-dx2] * -0.0625
+                   +in[xy-dy2] * -0.0625
+                   +in[xy+dx3] * 0.041666666666666664
+                   +in[xy+dy3] * 0.041666666666666664
+                   +in[xy-dx3] * -0.041666666666666664
+                   +in[xy-dy3] * -0.041666666666666664
+                   +in[xy+dx4] * 0.03125
+                   +in[xy+dy4] * 0.03125
+                   +in[xy-dx4] * -0.03125
+                   +in[xy-dy4] * -0.03125;
+    });
+  });
+}
+
+void star5(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> d_in,
+           cl::sycl::buffer<double, 2> d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star5>(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5},
+                                [=] (cl::sycl::item<2> it) {
+        cl::sycl::id<2> xy = it.get_id();
+        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+        cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
+        cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
+        cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
+        cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
+        cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
+        cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
+        out[xy] += +in[xy+dx1] * 0.1
+                   +in[xy+dy1] * 0.1
+                   +in[xy-dx1] * -0.1
+                   +in[xy-dy1] * -0.1
+                   +in[xy+dx2] * 0.05
+                   +in[xy+dy2] * 0.05
+                   +in[xy-dx2] * -0.05
+                   +in[xy-dy2] * -0.05
+                   +in[xy+dx3] * 0.03333333333333333
+                   +in[xy+dy3] * 0.03333333333333333
+                   +in[xy-dx3] * -0.03333333333333333
+                   +in[xy-dy3] * -0.03333333333333333
+                   +in[xy+dx4] * 0.025
+                   +in[xy+dy4] * 0.025
+                   +in[xy-dx4] * -0.025
+                   +in[xy-dy4] * -0.025
+                   +in[xy+dx5] * 0.02
+                   +in[xy+dy5] * 0.02
+                   +in[xy-dx5] * -0.02
+                   +in[xy-dy5] * -0.02;
+    });
+  });
+}
+

From 8dc644e51f6c8e40c4aa2fdb2d6aca231c42a154 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 15 Jan 2018 21:10:36 -0800
Subject: [PATCH 027/245] no clue why OCCA is so hard to get runnig in Travis

---
 travis/install-occa.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/install-occa.sh b/travis/install-occa.sh
index 887366219..810580cd0 100644
--- a/travis/install-occa.sh
+++ b/travis/install-occa.sh
@@ -40,7 +40,7 @@ ${PRK_CXX} -v
 if [ ! -d "$TRAVIS_ROOT/occa" ]; then
     BRANCH="1.0"
     git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa
-    CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa
+    CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -C $TRAVIS_ROOT/occa
 else
     echo "OCCA installed..."
     find $TRAVIS_ROOT/occa -name occa.hpp

From 25b5c7c8d6dfeb628d404f191c3399423c267328 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 15 Jan 2018 21:12:41 -0800
Subject: [PATCH 028/245] workaround Mac ld

---
 travis/build-run-prk.sh | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index b76414933..10a1841f6 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -575,11 +575,14 @@ case "$PRK_TARGET" in
         done
 
         # C++ w/ OCCA
-        echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs
-        export OCCA_CXX=${PRK_CXX}
-        make -C $PRK_TARGET_PATH transpose-occa nstream-occa
-        $PRK_TARGET_PATH/transpose-occa   10 1024 32
-        $PRK_TARGET_PATH/nstream-occa     10 16777216 32
+        # OCCA sets  -Wl,-rpath=${OCCA_LIB}, which chokes Mac's ld.
+        if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
+            echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs
+            export OCCA_CXX=${PRK_CXX}
+            make -C $PRK_TARGET_PATH transpose-occa nstream-occa
+            $PRK_TARGET_PATH/transpose-occa   10 1024 32
+            $PRK_TARGET_PATH/nstream-occa     10 16777216 32
+        fi
         ;;
     allfortran)
         echo "Fortran"

From 34e99ad5baffd409f13b35cc7b69c2a99ba17f96 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 17 Jan 2018 09:37:27 -0800
Subject: [PATCH 029/245] improve SYCL stencil

- use size_t instead of int so i can use nice initializers for ranges
- fix bug in accessors where in was read instead of read_write
- do initialization of <in> on the device in SYCL instead of host.
---
 Cxx11/stencil-sycl.cc | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index fcd193298..74a8a2801 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -114,7 +114,7 @@ int main(int argc, char* argv[])
 
   int iterations;
   size_t n;
-  int radius = 2;
+  size_t radius = 2;
   bool star = true;
   try {
       if (argc < 3) {
@@ -200,23 +200,36 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> h_in;
   std::vector<double> h_out;
-  h_in.resize(n*n);
-  h_out.resize(n*n);
+  h_out.resize(n*n,0.0);
 
   for (auto i=0; i<n; i++) {
     for (auto j=0; j<n; j++) {
-      h_in[i*n+j] = static_cast<double>(i+j);
       h_out[i*n+j] = 0.0;
     }
   }
 
   {
     // initialize device buffers from host buffers
-    cl::sycl::buffer<double, 2> d_in  { h_in.data() , cl::sycl::range<2> {n, n} };
+    cl::sycl::buffer<double, 2> d_in  { cl::sycl::range<2> {n, n} };
     cl::sycl::buffer<double, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
 
+    q.submit([&](cl::sycl::handler& h) {
+
+      // accessor methods
+      auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
+
+      // Add constant to solution to force refresh of neighbor data, if any
+      h.parallel_for<class init>(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0},
+                                [=] (cl::sycl::item<2> it) {
+          cl::sycl::id<2> xy = it.get_id();
+          auto i = xy[0];
+          auto j = xy[1];
+          in[xy] = static_cast<double>(i+j);
+      });
+    });
+    q.wait();
+
     for (auto iter = 0; iter<=iterations; iter++) {
    
       if (iter==1) stencil_time = prk::wtime();
@@ -226,7 +239,7 @@ int main(int argc, char* argv[])
       q.submit([&](cl::sycl::handler& h) {
 
         // accessor methods
-        auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+        auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
         auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
        
         // Add constant to solution to force refresh of neighbor data, if any
@@ -246,7 +259,7 @@ int main(int argc, char* argv[])
   //////////////////////////////////////////////////////////////////////
 
   // interior of grid with respect to stencil
-  size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
+  auto active_points = (n-2L*radius)*(n-2L*radius);
 
   // compute L1 norm in parallel
   double norm = 0.0;
@@ -270,8 +283,8 @@ int main(int argc, char* argv[])
     std::cout << "L1 norm = " << norm
               << " Reference L1 norm = " << reference_norm << std::endl;
 #endif
-    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
-    size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
+    const size_t stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+    size_t flops = (2L*stencil_size+1L) * active_points;
     auto avgtime = stencil_time/iterations;
     std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
               << " Avg time (s): " << avgtime << std::endl;

From ac9bbed2dffeae4e8f505b2edee1dbcd9e828bad Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 17 Jan 2018 21:28:13 -0800
Subject: [PATCH 030/245] disable OCCA in Travis [ci skip]

---
 travis/build-run-prk.sh | 14 +++++++-------
 travis/install-deps.sh  |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 619d5a00d..41a34612f 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -578,13 +578,13 @@ case "$PRK_TARGET" in
 
         # C++ w/ OCCA
         # OCCA sets  -Wl,-rpath=${OCCA_LIB}, which chokes Mac's ld.
-        if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
-            echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs
-            export OCCA_CXX=${PRK_CXX}
-            make -C $PRK_TARGET_PATH transpose-occa nstream-occa
-            $PRK_TARGET_PATH/transpose-occa   10 1024 32
-            $PRK_TARGET_PATH/nstream-occa     10 16777216 32
-        fi
+        #if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
+        #    echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs
+        #    export OCCA_CXX=${PRK_CXX}
+        #    make -C $PRK_TARGET_PATH transpose-occa nstream-occa
+        #    $PRK_TARGET_PATH/transpose-occa   10 1024 32
+        #    $PRK_TARGET_PATH/nstream-occa     10 16777216 32
+        #fi
         ;;
     allfortran)
         echo "Fortran"
diff --git a/travis/install-deps.sh b/travis/install-deps.sh
index 35e926c76..42b620858 100755
--- a/travis/install-deps.sh
+++ b/travis/install-deps.sh
@@ -69,7 +69,7 @@ case "$PRK_TARGET" in
         sh ./travis/install-cmake.sh $TRAVIS_ROOT
         sh ./travis/install-raja.sh $TRAVIS_ROOT
         sh ./travis/install-kokkos.sh $TRAVIS_ROOT
-        sh ./travis/install-occa.sh $TRAVIS_ROOT
+        #sh ./travis/install-occa.sh $TRAVIS_ROOT
         ;;
     allfortran)
         echo "Fortran"

From 2de3b7774e1700f9aeaedf4f0d77b6a932f8beb7 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 19 Jan 2018 12:18:38 -0800
Subject: [PATCH 031/245] fix banner error [ci skip]

---
 Cxx11/transpose-kokkos.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc
index 7a7543b04..268d9e19a 100644
--- a/Cxx11/transpose-kokkos.cc
+++ b/Cxx11/transpose-kokkos.cc
@@ -54,7 +54,7 @@
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl;
+  std::cout << "C++11 Matrix transpose: B = A^T" << std::endl;
 
   Kokkos::initialize(argc, argv);
 

From 4bd6a3c4900d44c070db91a4688c78aeb8f40d62 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 19 Jan 2018 12:18:38 -0800
Subject: [PATCH 032/245] fix banner error [ci skip]

---
 Cxx11/transpose-kokkos.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc
index 7a7543b04..268d9e19a 100644
--- a/Cxx11/transpose-kokkos.cc
+++ b/Cxx11/transpose-kokkos.cc
@@ -54,7 +54,7 @@
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl;
+  std::cout << "C++11 Matrix transpose: B = A^T" << std::endl;
 
   Kokkos::initialize(argc, argv);
 

From 9633215c36b1c58ea24bc76e74fff866a37178a0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 19 Jan 2018 13:26:15 -0800
Subject: [PATCH 033/245] detect bad input settings [ci skip]

---
 Cxx11/transpose-vector-async.cc  |  8 +++++++-
 Cxx11/transpose-vector-thread.cc | 10 ++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc
index 43040c389..74a349cd2 100644
--- a/Cxx11/transpose-vector-async.cc
+++ b/Cxx11/transpose-vector-async.cc
@@ -109,8 +109,14 @@ int main(int argc, char * argv[])
   std::cout << "Block size            = " << block_size << std::endl;
   std::cout << "Tile size             = " << tile_size << std::endl;
 
+  if (num_futures > 300) {
+      std::cout << "These settings may lead to resource exhaustion.\n"
+                << "Please use a larger block size.\n";
+      return 1;
+  }
+
   //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
+  // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
   std::vector<double> A;
diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc
index c6c60f8b2..8ce3d79c7 100644
--- a/Cxx11/transpose-vector-thread.cc
+++ b/Cxx11/transpose-vector-thread.cc
@@ -89,6 +89,10 @@ int main(int argc, char * argv[])
         throw "ERROR: block size must be greater than 0";
       }
 
+      if (order / block_size > 16) {
+          throw "ERROR: this will create more than 256 threads";
+      }
+
       // default tile size for tiling of local transpose
       tile_size = (argc>4) ? std::atoi(argv[4]) : 32;
       // a negative tile size means no tiling of the local transpose
@@ -109,6 +113,12 @@ int main(int argc, char * argv[])
   std::cout << "Block size            = " << block_size << std::endl;
   std::cout << "Tile size             = " << tile_size << std::endl;
 
+  if (num_threads > 300) {
+      std::cout << "These settings may lead to resource exhaustion.\n"
+                << "Please use a larger block size.\n";
+      return 1;
+  }
+
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////

From 843aab9ec2fd02bcbe181f284565be416bf6f7cb Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 19 Jan 2018 13:30:39 -0800
Subject: [PATCH 034/245] add SYCL to docs

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 478a93e34..39c5cd89e 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,7 @@ i = in-progress, incomplete, or incorrect
 | OpenMP tasks         |  y  |    y    |     y     |    y    |        |       |
 | OpenMP target        |  y  |    y    |     y     |    y    |        |       |
 | OpenCL 1.x           |  i  |    y    |     y     |    y    |        |       |
+| SYCL                 |     |    y    |     y     |    y    |        |       |
 | Parallel STL         |  y  |    y    |     y     |    y    |        |       |
 | TBB                  |  i  |    y    |     y     |    y    |        |       |
 | Kokkos               |  y  |    y    |     y     |    y    |        |       |

From f713445124e0613a3a88a4795514e46bef86c250 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 19 Jan 2018 13:31:36 -0800
Subject: [PATCH 035/245] add CUDA nstream to docs [ci skip]

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 478a93e34..c7256cb7a 100644
--- a/README.md
+++ b/README.md
@@ -91,8 +91,8 @@ i = in-progress, incomplete, or incorrect
 | TBB                  |  i  |    y    |     y     |    y    |        |       |
 | Kokkos               |  y  |    y    |     y     |    y    |        |       |
 | RAJA                 |  y  |    y    |     y     |    y    |        |       |
-| CUDA                 |     |         |     y     |         |        |       |
-| CUBLAS               |     |         |     y     |         |        |       |
+| CUDA                 |     |         |     y     |    y    |        |       |
+| CUBLAS               |     |         |     y     |    y    |        |       |
 | CBLAS                |     |         |           |         |        |   y   |
 
 * [TBB](https://www.threadingbuildingblocks.org/)

From 7de1f05fec65503bd2e39365590eae322630c8d9 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 19 Jan 2018 16:25:58 -0800
Subject: [PATCH 036/245] fix banner [ci skip]

---
 Cxx11/nstream-vector-pstl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index e1fb1ce05..c243f0ff1 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -70,7 +70,7 @@ int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #if defined(USE_PSTL)
-  std::cout << "C++17 STREAM triad: A = B + scalar * C" << std::endl;
+  std::cout << "C++17 Parallel STL STREAM triad: A = B + scalar * C" << std::endl;
 #else
   std::cout << "C++11 STL STREAM triad: A = B + scalar * C" << std::endl;
 #endif

From a27f728b8665d3048446e7cf713bc140495e6f39 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Jan 2018 16:50:05 -0800
Subject: [PATCH 037/245] add missing block size to C++11 thread+async
 transpose

---
 travis/build-run-prk.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 41a34612f..97a91b2bb 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -349,8 +349,8 @@ case "$PRK_TARGET" in
 
         # C++11 native parallelism
         make -C $PRK_TARGET_PATH transpose-vector-thread transpose-vector-async
-        $PRK_TARGET_PATH/transpose-vector-thread 10 1024 32
-        $PRK_TARGET_PATH/transpose-vector-async  10 1024 32
+        $PRK_TARGET_PATH/transpose-vector-thread 10 1024 512 32
+        $PRK_TARGET_PATH/transpose-vector-async  10 1024 512 32
 
         # C++11 with rangefor
         echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs

From a318b234fa775d75cb53c8b21250cd1081e9252f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 21 Jan 2018 12:12:32 -0800
Subject: [PATCH 038/245] no need to resize stl::vector so stop it

---
 Cxx11/dgemm-cblas.cc             |  9 +++------
 Cxx11/dgemm-vector.cc            |  9 +++------
 Cxx11/nstream-opencl.cc          |  9 +++------
 Cxx11/nstream-sycl.cc            | 13 +++++--------
 Cxx11/nstream-vector-openmp.cc   |  9 +++------
 Cxx11/nstream-vector-pstl.cc     |  9 +++------
 Cxx11/nstream-vector-raja.cc     |  9 +++------
 Cxx11/nstream-vector-rangefor.cc |  9 +++------
 Cxx11/nstream-vector-taskloop.cc |  9 +++------
 Cxx11/nstream-vector-tbb.cc      |  9 +++------
 Cxx11/nstream-vector.cc          |  9 +++------
 11 files changed, 35 insertions(+), 68 deletions(-)

diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index a239102a2..5fe2c5ab9 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -143,12 +143,9 @@ int main(int argc, char * argv[])
 
   double dgemm_time(0);
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(order*order);
-  B.resize(order*order);
-  C.resize(order*order,0.0);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order);
+  std::vector<double> C(order*order,0.0);
 #ifdef PRK_DEBUG
   const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
   std::default_random_engine generator(seed);
diff --git a/Cxx11/dgemm-vector.cc b/Cxx11/dgemm-vector.cc
index 7cb102194..973c0df97 100644
--- a/Cxx11/dgemm-vector.cc
+++ b/Cxx11/dgemm-vector.cc
@@ -156,12 +156,9 @@ int main(int argc, char * argv[])
 
   double dgemm_time(0);
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(order*order);
-  B.resize(order*order);
-  C.resize(order*order,0.0);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order);
+  std::vector<double> C(order*order,0.0);
   for (auto i=0; i<order; ++i) {
     for (auto j=0; j<order; ++j) {
        A[i*order+j] = i;
diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc
index 823ded5a9..18a5a022c 100644
--- a/Cxx11/nstream-opencl.cc
+++ b/Cxx11/nstream-opencl.cc
@@ -88,12 +88,9 @@ void run(cl::Context context, int iterations, size_t length)
 
   auto nstream_time = 0.0;
 
-  std::vector<T> h_a;
-  std::vector<T> h_b;
-  std::vector<T> h_c;
-  h_a.resize(length, (T)0);
-  h_b.resize(length, (T)2);
-  h_c.resize(length, (T)2);
+  std::vector<T> h_a(length, T(0));
+  std::vector<T> h_b(length, T(2));
+  std::vector<T> h_c(length, T(2));
 
   // copy input from host to device
   cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), true);
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 306dc7038..b21c73593 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -113,12 +113,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> h_A;
-  std::vector<double> h_B;
-  std::vector<double> h_C;
-  h_A.resize(length);
-  h_B.resize(length);
-  h_C.resize(length);
+  std::vector<double> h_A(length);
+  std::vector<double> h_B(length);
+  std::vector<double> h_C(length);
 
   auto range = boost::irange(static_cast<size_t>(0), length);
 
@@ -137,9 +134,9 @@ int main(int argc, char * argv[])
     cl::sycl::buffer<double> d_C { h_C.data(), h_C.size() };
 
     for (auto iter = 0; iter<=iterations; iter++) {
-   
+
       if (iter==1) nstream_time = prk::wtime();
-   
+
       q.submit([&](cl::sycl::handler& h) {
 
         // accessor methods
diff --git a/Cxx11/nstream-vector-openmp.cc b/Cxx11/nstream-vector-openmp.cc
index d62c9000b..d48015df6 100644
--- a/Cxx11/nstream-vector-openmp.cc
+++ b/Cxx11/nstream-vector-openmp.cc
@@ -117,12 +117,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length);
-  B.resize(length);
-  C.resize(length);
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
 
   double scalar = 3.0;
 
diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index ae67ea494..6ba74fe57 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -116,12 +116,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length);
-  B.resize(length);
-  C.resize(length);
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
 
   auto range = boost::irange(static_cast<size_t>(0), length);
 
diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc
index 91f87b93c..31c6434e7 100644
--- a/Cxx11/nstream-vector-raja.cc
+++ b/Cxx11/nstream-vector-raja.cc
@@ -119,12 +119,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length);
-  B.resize(length);
-  C.resize(length);
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
 
   double scalar = 3.0;
 
diff --git a/Cxx11/nstream-vector-rangefor.cc b/Cxx11/nstream-vector-rangefor.cc
index 92feef10e..54bad9274 100644
--- a/Cxx11/nstream-vector-rangefor.cc
+++ b/Cxx11/nstream-vector-rangefor.cc
@@ -112,12 +112,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length,0.0);
-  B.resize(length,2.0);
-  C.resize(length,2.0);
+  std::vector<double> A(length,0.0);
+  std::vector<double> B(length,2.0);
+  std::vector<double> C(length,2.0);
 
   auto range = boost::irange(static_cast<size_t>(0), length);
 
diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc
index 272047a54..dede73b16 100644
--- a/Cxx11/nstream-vector-taskloop.cc
+++ b/Cxx11/nstream-vector-taskloop.cc
@@ -124,12 +124,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length);
-  B.resize(length);
-  C.resize(length);
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
 
   double scalar = 3.0;
 
diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc
index da78be1df..cb73d3eda 100644
--- a/Cxx11/nstream-vector-tbb.cc
+++ b/Cxx11/nstream-vector-tbb.cc
@@ -116,12 +116,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length);
-  B.resize(length);
-  C.resize(length);
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
 
   double scalar(3);
 
diff --git a/Cxx11/nstream-vector.cc b/Cxx11/nstream-vector.cc
index bf5879ca2..c19938da3 100644
--- a/Cxx11/nstream-vector.cc
+++ b/Cxx11/nstream-vector.cc
@@ -109,12 +109,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  std::vector<double> C;
-  A.resize(length,0.0);
-  B.resize(length,2.0);
-  C.resize(length,2.0);
+  std::vector<double> A(length,0.0);
+  std::vector<double> B(length,2.0);
+  std::vector<double> C(length,2.0);
 
   double scalar = 3.0;
 

From 75ff0496031d11ea685ca1b7c3c1b3814812af27 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 21 Jan 2018 12:22:10 -0800
Subject: [PATCH 039/245] no need to resize stl::vector so stop it

---
 Cxx11/sparse-vector.cc             | 13 ++++---------
 Cxx11/stencil-opencl.cc            |  6 ++----
 Cxx11/stencil-sycl.cc              |  9 +--------
 Cxx11/transpose-opencl.cc          |  7 +++----
 Cxx11/transpose-sycl.cc            |  6 ++----
 Cxx11/transpose-vector-async.cc    |  7 +++----
 Cxx11/transpose-vector-openmp.cc   |  6 ++----
 Cxx11/transpose-vector-pstl.cc     |  7 +++----
 Cxx11/transpose-vector-raja.cc     |  6 ++----
 Cxx11/transpose-vector-rangefor.cc |  6 ++----
 Cxx11/transpose-vector-taskloop.cc |  6 ++----
 Cxx11/transpose-vector-tbb.cc      |  6 ++----
 Cxx11/transpose-vector-thread.cc   |  7 +++----
 Cxx11/transpose-vector.cc          |  7 +++----
 14 files changed, 34 insertions(+), 65 deletions(-)

diff --git a/Cxx11/sparse-vector.cc b/Cxx11/sparse-vector.cc
index b4c7445b6..c521528e8 100644
--- a/Cxx11/sparse-vector.cc
+++ b/Cxx11/sparse-vector.cc
@@ -158,15 +158,10 @@ int main(int argc, char* argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> matrix;
-  std::vector<size_t> colIndex;
-  std::vector<double> vector;
-  std::vector<double> result;
-
-  matrix.resize(nent,0.0);
-  colIndex.resize(nent,0);
-  vector.resize(size2,0.0);
-  result.resize(size2,0.0);
+  std::vector<double> matrix(nent,0.0);
+  std::vector<size_t> colIndex(nent,0);
+  std::vector<double> vector(size2,0.0);
+  std::vector<double> result(size2,0.0);
 
   double sparse_time(0);
 
diff --git a/Cxx11/stencil-opencl.cc b/Cxx11/stencil-opencl.cc
index e75c416de..89a261cc9 100644
--- a/Cxx11/stencil-opencl.cc
+++ b/Cxx11/stencil-opencl.cc
@@ -107,10 +107,8 @@ void run(cl::Context context, int iterations, int n, int radius, bool star)
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<T> h_in;
-  std::vector<T> h_out;
-  h_in.resize(n*n, (T)0);
-  h_out.resize(n*n, (T)0);
+  std::vector<T> h_in(n*n,  T(0));
+  std::vector<T> h_out(n*n, T(0));
 
   auto stencil_time = 0.0;
 
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 74a8a2801..f1ecb8abe 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -200,14 +200,7 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> h_out;
-  h_out.resize(n*n,0.0);
-
-  for (auto i=0; i<n; i++) {
-    for (auto j=0; j<n; j++) {
-      h_out[i*n+j] = 0.0;
-    }
-  }
+  std::vector<double> h_out(n*n,0.0);
 
   {
     // initialize device buffers from host buffers
diff --git a/Cxx11/transpose-opencl.cc b/Cxx11/transpose-opencl.cc
index 7f632f297..4e22114d5 100644
--- a/Cxx11/transpose-opencl.cc
+++ b/Cxx11/transpose-opencl.cc
@@ -78,10 +78,9 @@ void run(cl::Context context, int iterations, int order)
   //////////////////////////////////////////////////////////////////////
 
   const size_t nelems = (size_t)order * (size_t)order;
-  std::vector<T> h_a;
-  std::vector<T> h_b;
-  h_a.resize(nelems);
-  h_b.resize(nelems, (T)0);
+  std::vector<T> h_a(nelems);
+  std::vector<T> h_b(nelems, T(0));
+
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(h_a.begin(), h_a.end(), (T)0);
 
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 80ebb12b0..cbbc1a2a1 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -96,10 +96,8 @@ int main(int argc, char * argv[])
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> h_A;
-  std::vector<double> h_B;
-  h_A.resize(order*order);
-  h_B.resize(order*order,0.0);
+  std::vector<double> h_A(order*order);
+  std::vector<double> h_B(order*order,0.0);
 
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(h_A.begin(), h_A.end(), 0.0);
diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc
index 74a349cd2..8f285b1ad 100644
--- a/Cxx11/transpose-vector-async.cc
+++ b/Cxx11/transpose-vector-async.cc
@@ -119,10 +119,9 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A;
-  std::vector<double> B;
-  B.resize(order*order,0.0);
-  A.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order,0.0);
+
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
diff --git a/Cxx11/transpose-vector-openmp.cc b/Cxx11/transpose-vector-openmp.cc
index aaaf4e5ab..4e02d09bb 100644
--- a/Cxx11/transpose-vector-openmp.cc
+++ b/Cxx11/transpose-vector-openmp.cc
@@ -112,10 +112,8 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  A.resize(order*order);
-  B.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order);
 
   OMP_PARALLEL()
   {
diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc
index 616c94d17..8b9734200 100644
--- a/Cxx11/transpose-vector-pstl.cc
+++ b/Cxx11/transpose-vector-pstl.cc
@@ -99,10 +99,9 @@ int main(int argc, char * argv[])
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A;
-  std::vector<double> B;
-  B.resize(order*order,0.0);
-  A.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order,0.0);
+
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc
index 7b183913a..84738694d 100644
--- a/Cxx11/transpose-vector-raja.cc
+++ b/Cxx11/transpose-vector-raja.cc
@@ -272,10 +272,8 @@ int main(int argc, char * argv[])
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A;
-  std::vector<double> B;
-  A.resize(order*order);
-  B.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order);
 
   if (use_for=="seq") {
     if (use_nested) {
diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc
index ef18d76a0..ee0097026 100644
--- a/Cxx11/transpose-vector-rangefor.cc
+++ b/Cxx11/transpose-vector-rangefor.cc
@@ -93,10 +93,8 @@ int main(int argc, char * argv[])
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A;
-  std::vector<double> B;
-  B.resize(order*order,0.0);
-  A.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order,0.0);
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
diff --git a/Cxx11/transpose-vector-taskloop.cc b/Cxx11/transpose-vector-taskloop.cc
index e54d87913..17dbad525 100644
--- a/Cxx11/transpose-vector-taskloop.cc
+++ b/Cxx11/transpose-vector-taskloop.cc
@@ -113,10 +113,8 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A;
-  std::vector<double> B;
-  A.resize(order*order);
-  B.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order);
 
   auto trans_time = 0.0;
 
diff --git a/Cxx11/transpose-vector-tbb.cc b/Cxx11/transpose-vector-tbb.cc
index 98d6922b9..45ea4bc5b 100644
--- a/Cxx11/transpose-vector-tbb.cc
+++ b/Cxx11/transpose-vector-tbb.cc
@@ -111,10 +111,8 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  A.resize(order*order);
-  B.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order);
 
   tbb::blocked_range2d<int> range(0, order, tile_size, 0, order, tile_size);
   tbb::parallel_for( range, [&](decltype(range)& r) {
diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc
index 8ce3d79c7..57fbf11ea 100644
--- a/Cxx11/transpose-vector-thread.cc
+++ b/Cxx11/transpose-vector-thread.cc
@@ -123,10 +123,9 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A;
-  std::vector<double> B;
-  B.resize(order*order,0.0);
-  A.resize(order*order);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order,0.0);
+
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
diff --git a/Cxx11/transpose-vector.cc b/Cxx11/transpose-vector.cc
index c6199ff40..943a6e380 100644
--- a/Cxx11/transpose-vector.cc
+++ b/Cxx11/transpose-vector.cc
@@ -103,10 +103,9 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A;
-  std::vector<double> B;
-  A.resize(order*order);
-  B.resize(order*order,0.0);
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order,0.0);
+
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 

From cafd273145ba21b81c2b893e75c60bd17bda6c85 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 21 Jan 2018 12:27:39 -0800
Subject: [PATCH 040/245] no need to resize stl::vector so stop it

---
 Cxx11/dgemm-cblas.cc                | 3 +--
 Cxx11/p2p-doacross-vector-openmp.cc | 4 +---
 Cxx11/p2p-innerloop-opencl.cc       | 4 +---
 Cxx11/p2p-innerloop-vector-tbb.cc   | 4 +---
 Cxx11/p2p-vector-raja.cc            | 4 +---
 Cxx11/p2p-vector-tbb.cc             | 4 +---
 Cxx11/p2p-vector.cc                 | 4 +---
 Cxx11/stencil-vector-openmp.cc      | 6 ++----
 Cxx11/stencil-vector-pstl.cc        | 6 ++----
 Cxx11/stencil-vector-raja.cc        | 6 ++----
 Cxx11/stencil-vector-rangefor.cc    | 6 ++----
 Cxx11/stencil-vector-taskloop.cc    | 6 ++----
 Cxx11/stencil-vector-tbb.cc         | 6 ++----
 Cxx11/stencil-vector.cc             | 6 ++----
 14 files changed, 21 insertions(+), 48 deletions(-)

diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index 5fe2c5ab9..fc5709812 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -182,8 +182,7 @@ int main(int argc, char * argv[])
   const auto epsilon = 1.0e-8;
   const auto forder = static_cast<double>(order);
 #ifdef PRK_DEBUG
-  std::vector<double> D;
-  D.resize(order*order,0.0);
+  std::vector<double> D(order*order,0.0);;
   for (auto iter = 0; iter<=iterations; iter++) {
     prk_dgemm_loops(order, A, B, D);
   }
diff --git a/Cxx11/p2p-doacross-vector-openmp.cc b/Cxx11/p2p-doacross-vector-openmp.cc
index 2650c0a81..2d271c92b 100644
--- a/Cxx11/p2p-doacross-vector-openmp.cc
+++ b/Cxx11/p2p-doacross-vector-openmp.cc
@@ -108,9 +108,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  // working set
-  std::vector<double> grid;
-  grid.resize(m*n);
+  std::vector<double> grid(m*n);;
 
   OMP_PARALLEL()
   {
diff --git a/Cxx11/p2p-innerloop-opencl.cc b/Cxx11/p2p-innerloop-opencl.cc
index 620f415d3..2552fe787 100644
--- a/Cxx11/p2p-innerloop-opencl.cc
+++ b/Cxx11/p2p-innerloop-opencl.cc
@@ -84,9 +84,7 @@ void run(cl::Context context, int iterations, int n)
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
-  const int nelems = n*n;
-  std::vector<T> h_grid;
-  h_grid.resize(nelems, (T)0);
+  std::vector<T> h_grid(n*n, T(0));
   for (auto j=0; j<n; j++) {
     h_grid[0*n+j] = static_cast<double>(j);
   }
diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc
index eb17ca3e5..788226f71 100644
--- a/Cxx11/p2p-innerloop-vector-tbb.cc
+++ b/Cxx11/p2p-innerloop-vector-tbb.cc
@@ -112,9 +112,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  // working set
-  std::vector<double> grid;
-  grid.resize(n*n,0.0);
+  std::vector<double> grid(n*n,0.0);
 
   // set boundary values (bottom and left side of grid)
   for (auto j=0; j<n; j++) {
diff --git a/Cxx11/p2p-vector-raja.cc b/Cxx11/p2p-vector-raja.cc
index 8115a0d91..7dfeea21d 100644
--- a/Cxx11/p2p-vector-raja.cc
+++ b/Cxx11/p2p-vector-raja.cc
@@ -117,9 +117,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  // working set
-  std::vector<double> grid;
-  grid.resize(m*n,0.0);
+  std::vector<double> grid(m*n,0.0);
 
   // set boundary values (bottom and left side of grid)
   for (auto j=0; j<n; j++) {
diff --git a/Cxx11/p2p-vector-tbb.cc b/Cxx11/p2p-vector-tbb.cc
index bb54afeb9..bcc45b27b 100644
--- a/Cxx11/p2p-vector-tbb.cc
+++ b/Cxx11/p2p-vector-tbb.cc
@@ -175,9 +175,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  // working set
-  std::vector<double> grid;
-  grid.resize(m*n,0.0);
+  std::vector<double> grid(m*n,0.0);
 
   // set boundary values (bottom and left side of grid)
   for (auto j=0; j<n; j++) {
diff --git a/Cxx11/p2p-vector.cc b/Cxx11/p2p-vector.cc
index c0e05d7fb..de7337980 100644
--- a/Cxx11/p2p-vector.cc
+++ b/Cxx11/p2p-vector.cc
@@ -128,9 +128,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  // working set
-  std::vector<double> grid;
-  grid.resize(m*n,0.0);
+  std::vector<double> grid(m*n,0.0);;
 
   {
     // set boundary values (bottom and left side of grid)
diff --git a/Cxx11/stencil-vector-openmp.cc b/Cxx11/stencil-vector-openmp.cc
index 8ff184d98..98343a798 100644
--- a/Cxx11/stencil-vector-openmp.cc
+++ b/Cxx11/stencil-vector-openmp.cc
@@ -175,10 +175,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);
+  std::vector<double> out(n*n);
 
   OMP_PARALLEL()
   {
diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc
index a661ff736..863a50df5 100644
--- a/Cxx11/stencil-vector-pstl.cc
+++ b/Cxx11/stencil-vector-pstl.cc
@@ -176,10 +176,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);
+  std::vector<double> out(n*n);
 
   // initialize the input and output arrays
   auto range = boost::irange(0,n);
diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc
index cf2243bf6..3bcecb4ec 100644
--- a/Cxx11/stencil-vector-raja.cc
+++ b/Cxx11/stencil-vector-raja.cc
@@ -173,10 +173,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);
+  std::vector<double> out(n*n);
 
 #if 0
   RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>
diff --git a/Cxx11/stencil-vector-rangefor.cc b/Cxx11/stencil-vector-rangefor.cc
index 2f5cf7ce1..aef3a3880 100644
--- a/Cxx11/stencil-vector-rangefor.cc
+++ b/Cxx11/stencil-vector-rangefor.cc
@@ -164,10 +164,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);
+  std::vector<double> out(n*n);
 
   // initialize the input and output arrays
   auto range = boost::irange(0,n);
diff --git a/Cxx11/stencil-vector-taskloop.cc b/Cxx11/stencil-vector-taskloop.cc
index 52106b9cc..971d71db1 100644
--- a/Cxx11/stencil-vector-taskloop.cc
+++ b/Cxx11/stencil-vector-taskloop.cc
@@ -174,10 +174,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);;
+  std::vector<double> out(n*n);;
 
   OMP_PARALLEL()
   OMP_MASTER
diff --git a/Cxx11/stencil-vector-tbb.cc b/Cxx11/stencil-vector-tbb.cc
index 76d8be67c..2f5c27488 100644
--- a/Cxx11/stencil-vector-tbb.cc
+++ b/Cxx11/stencil-vector-tbb.cc
@@ -169,10 +169,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);
+  std::vector<double> out(n*n);
 
   tbb::blocked_range2d<int> range(0, n, tile_size, 0, n, tile_size);
   tbb::parallel_for( range, [&](decltype(range)& r) {
diff --git a/Cxx11/stencil-vector.cc b/Cxx11/stencil-vector.cc
index 57fcf86f6..26931780d 100644
--- a/Cxx11/stencil-vector.cc
+++ b/Cxx11/stencil-vector.cc
@@ -164,10 +164,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in;
-  std::vector<double> out;
-  in.resize(n*n);
-  out.resize(n*n);
+  std::vector<double> in(n*n);
+  std::vector<double> out(n*n);
 
   {
     for (auto it=0; it<n; it+=tile_size) {

From 3c77c6b0ccf91153a09995ee2138d64bbbc90d96 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 23 Jan 2018 10:08:47 -0800
Subject: [PATCH 041/245] fix rpath usage [ci skip]

---
 Cxx11/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index a73b87eb7..ca3144497 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -48,7 +48,7 @@ KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS)
 ifdef OCCADIR
   include ${OCCADIR}/scripts/makefile
 endif
-OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath=${OCCADIR}/lib -L${OCCADIR}/lib -locca
+OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca
 
 .PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl rangefor kokkos raja cuda
 

From 82bea6cf58b7297cefa0f5716b003e1eb0cea334 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 2 Feb 2018 14:38:27 -0800
Subject: [PATCH 042/245] try SOS 1.4.0

---
 travis/install-sandia-openshmem.sh | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/travis/install-sandia-openshmem.sh b/travis/install-sandia-openshmem.sh
index 8dcf9fe25..308c32d3d 100755
--- a/travis/install-sandia-openshmem.sh
+++ b/travis/install-sandia-openshmem.sh
@@ -13,15 +13,14 @@ TRAVIS_ROOT="$1"
 SHMEM_ROOT=$TRAVIS_ROOT/sandia-openshmem
 
 if [ ! -d "$SHMEM_ROOT" ]; then
-    # master
-    #git clone --depth 1 https://github.com/regrant/sandia-shmem.git sandia-shmem
+    # HEAD
     #git clone --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git sandia-shmem
-    #git clone -b v1.3.2 --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git sandia-shmem
     #cd sandia-shmem
-    # 1.3 release
-    wget https://github.com/Sandia-OpenSHMEM/SOS/archive/v1.3.2.tar.gz
-    tar -xzf v1.3.2.tar.gz
-    cd SOS-1.3.2
+    VERSION=1.4.0
+    #git clone -b v$VERSION --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git SOS-$VERSION
+    wget https://github.com/Sandia-OpenSHMEM/SOS/archive/v$VERSION.tar.gz
+    tar -xzf v$VERSION.tar.gz
+    cd SOS-$VERSION
     ./autogen.sh
     mkdir build
     cd build

From 40034c58c04f8df5e943896cd8a08e518d6b85e3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 7 Feb 2018 06:26:21 -0800
Subject: [PATCH 043/245] PGI 17.4 Mac workaround [ci skip]

---
 C1z/prk_util.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/C1z/prk_util.h b/C1z/prk_util.h
index 9df3a81fa..5d0831d34 100644
--- a/C1z/prk_util.h
+++ b/C1z/prk_util.h
@@ -46,7 +46,13 @@
 #include <stdio.h>   // atoi
 #include <stdlib.h>  // getenv
 #include <stdint.h>
+#if defined(__PGIC__)
+typedef _Bool bool;
+const bool true=1;
+const bool false=0;
+#else
 #include <stdbool.h> // bool
+#endif
 #include <string.h>
 #include <limits.h>
 #include <math.h>    // fabs
@@ -81,6 +87,7 @@
 #  define OMP_ORDERED(x) PRAGMA(omp ordered x)
 #  define OMP_TARGET(x) PRAGMA(omp target x)
 # else
+#  warning No OpenMP 4+ features!
 #  define OMP_SIMD
 #  define OMP_FOR_SIMD() PRAGMA(omp for x)
 #  define OMP_TASK(x)

From e02238199be2411d3e3019919d2e8131e8340550 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 8 Feb 2018 15:56:54 -0800
Subject: [PATCH 044/245] whitespace fix [ci skip]

---
 Cxx11/transpose-vector-openmp.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cxx11/transpose-vector-openmp.cc b/Cxx11/transpose-vector-openmp.cc
index 4e02d09bb..ba3a26321 100644
--- a/Cxx11/transpose-vector-openmp.cc
+++ b/Cxx11/transpose-vector-openmp.cc
@@ -100,11 +100,11 @@ int main(int argc, char * argv[])
   }
 
 #ifdef _OPENMP
-  std::cout << "Number of threads     = " << omp_get_max_threads() << std::endl;
+  std::cout << "Number of threads    = " << omp_get_max_threads() << std::endl;
 #endif
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
-  std::cout << "Tile size             = " << tile_size << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   /// Allocate space for the input and transpose matrix

From e8c3d35361e508ce560c8719f12d4da543fd467d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 8 Feb 2018 16:17:41 -0800
Subject: [PATCH 045/245] fix argv location of offset in taskloop [ci skip]

---
 Cxx11/nstream-vector-taskloop.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc
index dede73b16..d82e37d14 100644
--- a/Cxx11/nstream-vector-taskloop.cc
+++ b/Cxx11/nstream-vector-taskloop.cc
@@ -68,7 +68,7 @@ int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #ifdef _OPENMP
-  std::cout << "C++11/OpenMP STREAM triad: A = B + scalar * C" << std::endl;
+  std::cout << "C++11/OpenMP TASKLOOP STREAM triad: A = B + scalar * C" << std::endl;
 #else
   std::cout << "C++11 STREAM triad: A = B + scalar * C" << std::endl;
 #endif
@@ -100,7 +100,7 @@ int main(int argc, char * argv[])
         throw "ERROR: grainsize";
       }
 
-      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      offset = (argc>4) ? std::atoi(argv[4]) : 0;
       if (length <= 0) {
         throw "ERROR: offset must be nonnegative";
       }

From 7c6578d43e82b91d2fa668c5df43ea878a92452a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 8 Feb 2018 16:27:43 -0800
Subject: [PATCH 046/245] fix banner [ci skip]

---
 Cxx11/nstream-vector-taskloop.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc
index d82e37d14..95bd5c925 100644
--- a/Cxx11/nstream-vector-taskloop.cc
+++ b/Cxx11/nstream-vector-taskloop.cc
@@ -67,11 +67,7 @@
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-#ifdef _OPENMP
   std::cout << "C++11/OpenMP TASKLOOP STREAM triad: A = B + scalar * C" << std::endl;
-#else
-  std::cout << "C++11 STREAM triad: A = B + scalar * C" << std::endl;
-#endif
 
   //////////////////////////////////////////////////////////////////////
   /// Read and test input parameters

From d323a4b5069c7ff66562c8e10f7616e81efceb2a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 3 Mar 2018 12:05:38 -0800
Subject: [PATCH 047/245] cleanup flags [ci skip]

---
 common/make.defs.gcc   | 10 +++++++---
 common/make.defs.intel |  3 +++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index aaae093e1..3dcbdb030 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -6,22 +6,26 @@
 #
 VERSION=-7
 # C99 is required in some implementations.
-CC=gcc${VERSION} -std=c11 -pthread -Wall
+CC=gcc${VERSION} -std=c11 -pthread
 #EXTRA_CLIBS=-lrt
 # All of the Fortran code is written for the 2008 standard and requires preprocessing.
-FC=gfortran${VERSION} -std=f2008 -cpp -Wall
+FC=gfortran${VERSION} -std=f2008 -cpp
 # C++11 may not be required but does no harm here.
-CXX=g++${VERSION} -std=gnu++17 -pthread -Wall
+CXX=g++${VERSION} -std=gnu++17 -pthread
 #
 # Compiler flags
 #
 # -mtune=native is appropriate for most cases.
 # -march=native is appropriate if you want portable binaries.
 DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
+#
 # If you are compiling for KNL on a Xeon login node, use the following:
 # DEFAULT_OPT_FLAGS=-g -O3 -march=knl
 # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
 #
+DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
+DEFAULT_OPT_FLAGS+=-Wall
+#
 # OpenMP flags
 #
 OPENMPFLAG=-fopenmp
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 2f111cd58..087964e2c 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -20,9 +20,12 @@ CXX=icpc -std=c++14 -pthread
 #
 # -xHOST is appropriate for most cases.
 DEFAULT_OPT_FLAGS=-g -O3 -xHOST
+#
 # If you are compiling for KNL on a Xeon login node, use the following:
 # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512
 #
+DEFAULT_OPT_FLAGS+=-qopt-report=5
+#
 # OpenMP flags
 #
 OPENMPFLAG=-qopenmp

From 0131ab2c0451d3fdd4237ed74efc49b85f6ab475 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 9 Mar 2018 20:17:27 -0800
Subject: [PATCH 048/245] ignore Fortran dgemm binary [ci skip]

---
 FORTRAN/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index 7fbf78876..ee57e8255 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -94,7 +94,7 @@ clean:
 	-rm -f *.optrpt
 	-rm -f *.dwarf
 	-rm -rf *.dSYM # Mac
-	-rm -f p2p stencil transpose nstream
+	-rm -f p2p stencil transpose nstream dgemm
 	-rm -f p2p-innerloop
 	-rm -f *-pretty
 	-rm -f *-coarray

From 77e1f9f1c8de93a76c9e7b337f33656070df609a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 14 Mar 2018 08:10:22 -0700
Subject: [PATCH 049/245] OpenMP blocked wavefront (#317)

blocked hyperplane (aka innerloop) working w/ OpenMP

- rename p2p innerloop openmp to hyperplane, add to travis
- add skeleton for two-level hyperplane (todo later)
- performance looks pretty good:
```
./p2p-innerloop-vector-openmp 40 4096 512
Parallel Research Kernels version 2.16
C++11/OpenMP HYPERPLANE pipeline execution on 2D grid
Number of threads (max)   = 4
Number of iterations = 40
Grid sizes           = 4096, 4096
Grid chunk sizes     = 512
Solution validates
Rate (MFlops/s): 1362.17 Avg time (s): 0.0246211
```
---
 .gitignore                                    |  2 +-
 Cxx11/Makefile                                |  6 +-
 ...nmp.cc => p2p-hyperplane-vector-openmp.cc} | 74 ++++++++++++++++---
 common/make.defs.gcc                          |  6 +-
 travis/build-run-prk.sh                       |  5 +-
 5 files changed, 73 insertions(+), 20 deletions(-)
 rename Cxx11/{p2p-innerloop-vector-openmp.cc => p2p-hyperplane-vector-openmp.cc} (73%)

diff --git a/.gitignore b/.gitignore
index 8ac27dc88..6e6fe1688 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,7 +127,7 @@ Cxx11/p2p-innerloop-openmp
 Cxx11/p2p-doacross-vector-openmp
 Cxx11/p2p-innerloop-opencl
 Cxx11/p2p-innerloop-vector
-Cxx11/p2p-innerloop-vector-openmp
+Cxx11/p2p-hyperplane-vector-openmp
 Cxx11/p2p-innerloop-vector-tbb
 Cxx11/nstream-kokkos
 Cxx11/nstream-opencl
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index ca3144497..d3cacecd2 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -63,7 +63,7 @@ endif
 
 all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa $(EXTRA)
 
-p2p: p2p-vector p2p-doacross-vector-openmp p2p-innerloop-vector-openmp p2p-tasks-openmp p2p-openmp-target \
+p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl
 
 stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
@@ -85,7 +85,7 @@ vector: p2p-vector p2p-innerloop-vector stencil-vector transpose-vector nstream-
 
 valarray: transpose-valarray nstream-valarray
 
-openmp: p2p-innerloop-vector-openmp p2p-tasks-openmp stencil-vector-openmp transpose-vector-openmp nstream-vector-openmp
+openmp: p2p-hyperplane-vector-openmp p2p-tasks-openmp stencil-vector-openmp transpose-vector-openmp nstream-vector-openmp
 
 target: stencil-openmp-target transpose-openmp-target nstream-openmp-target
 
@@ -113,7 +113,7 @@ cublas: transpose-cublas nstream-cublas
 
 occa: transpose-occa nstream-occa
 
-p2p-innerloop-vector: p2p-innerloop-vector-openmp.cc prk_util.h
+p2p-innerloop-vector: p2p-hyperplane-vector-openmp.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
 
 transpose-opencl: transpose-opencl.cc transpose.cl prk_util.h prk_opencl.h
diff --git a/Cxx11/p2p-innerloop-vector-openmp.cc b/Cxx11/p2p-hyperplane-vector-openmp.cc
similarity index 73%
rename from Cxx11/p2p-innerloop-vector-openmp.cc
rename to Cxx11/p2p-hyperplane-vector-openmp.cc
index 2d398cdb9..4a3f317ae 100644
--- a/Cxx11/p2p-innerloop-vector-openmp.cc
+++ b/Cxx11/p2p-hyperplane-vector-openmp.cc
@@ -61,13 +61,39 @@
 
 #include "prk_util.h"
 
+inline void sweep_tile_sequential(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, double grid[])
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+#if 0
+inline void sweep_tile_hyperplane(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, double grid[])
+{
+  for (auto i=2; i<=2*n-2; i++) {
+    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+      const auto x = i-j+1;
+      const auto y = j-1;
+      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+    }
+  }
+}
+#endif
+
 int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #ifdef _OPENMP
-  std::cout << "C++11/OpenMP INNERLOOP pipeline execution on 2D grid" << std::endl;
+  std::cout << "C++11/OpenMP HYPERPLANE pipeline execution on 2D grid" << std::endl;
 #else
-  std::cout << "C++11/Serial INNERLOOP pipeline execution on 2D grid" << std::endl;
+  std::cout << "C++11/Serial HYPERPLANE pipeline execution on 2D grid" << std::endl;
 #endif
 
   //////////////////////////////////////////////////////////////////////
@@ -75,15 +101,15 @@ int main(int argc, char* argv[])
   //////////////////////////////////////////////////////////////////////
 
   int iterations;
-  int n;
+  int n, nc, nb;
   try {
       if (argc < 3) {
-        throw " <# iterations> <array dimension>";
+        throw " <# iterations> <array dimension> [<chunk dimension>]";
       }
 
       // number of times to run the pipeline algorithm
       iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
+      if (iterations < 0) {
         throw "ERROR: iterations must be >= 1";
       }
 
@@ -94,6 +120,18 @@ int main(int argc, char* argv[])
       } else if ( static_cast<size_t>(n)*static_cast<size_t>(n) > INT_MAX) {
         throw "ERROR: grid dimension too large - overflow risk";
       }
+
+      // grid chunk dimensions
+      nc = (argc > 3) ? std::atoi(argv[3]) : 1;
+      nc = std::max(1,nc);
+      nc = std::min(n,nc);
+
+      // number of grid blocks
+      nb = (n-1)/nc;
+      if ((n-1)%nc) nb++;
+      //std::cerr << "n="  << n << std::endl;
+      //std::cerr << "nb=" << nb << std::endl;
+      //std::cerr << "nc=" << nc << std::endl;
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -105,6 +143,7 @@ int main(int argc, char* argv[])
 #endif
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << nc << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
@@ -112,11 +151,11 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  // working set
   double * grid = new double[n*n];
 
   OMP_PARALLEL()
   {
+    // TODO block this
     OMP_FOR_SIMD
     for (auto i=0; i<n; i++) {
       for (auto j=0; j<n; j++) {
@@ -144,12 +183,23 @@ int main(int argc, char* argv[])
           pipeline_time = prk::wtime();
       }
 
-      for (auto i=2; i<=2*n-2; i++) {
-        OMP_FOR_SIMD
-        for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
-          const auto x = i-j+2-1;
-          const auto y = j-1;
-          grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+      if (nc==1) {
+        for (auto i=2; i<=2*n-2; i++) {
+          OMP_FOR_SIMD
+          for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+            const auto x = i-j+1;
+            const auto y = j-1;
+            grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+          }
+        }
+      } else {
+        for (int i=2; i<=2*(nb+1)-2; i++) {
+          OMP_FOR()
+          for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) {
+            const int ib = nc*(i-j+1-1)+1;
+            const int jb = nc*(j-1-1)+1;
+            sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+          }
         }
       }
       OMP_MASTER
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 3dcbdb030..3e7b55235 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -18,7 +18,9 @@ CXX=g++${VERSION} -std=gnu++17 -pthread
 # -mtune=native is appropriate for most cases.
 # -march=native is appropriate if you want portable binaries.
 DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
-#
+#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=undefined,leak
+#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=address
+#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=thread
 # If you are compiling for KNL on a Xeon login node, use the following:
 # DEFAULT_OPT_FLAGS=-g -O3 -march=knl
 # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
@@ -50,7 +52,7 @@ SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 #
 # OCCA
 #
-OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
 #
 # Cilk
 #
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 97a91b2bb..55898719f 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -371,10 +371,11 @@ case "$PRK_TARGET" in
             gcc)
                 # Host
                 echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-vector-openmp stencil-vector-openmp \
+                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \
                                          transpose-vector-openmp nstream-vector-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
-                $PRK_TARGET_PATH/p2p-innerloop-vector-openmp      10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024 64
                 $PRK_TARGET_PATH/stencil-vector-openmp            10 1000
                 $PRK_TARGET_PATH/transpose-vector-openmp          10 1024 32
                 $PRK_TARGET_PATH/nstream-vector-openmp            10 16777216 32

From a79f505e025babefd8e6973c39a39599a347f7fe Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 16 Mar 2018 15:27:35 -0700
Subject: [PATCH 050/245] SYCL with Travis support (#311)

* SYCL Travis support

Documentation/Examples
- add ProGTX impl of SYCL to examples
- add OpenMP flag to SYCL example (both triSYCL and ProGTX need it)

Bug fix:
- support all star stencils correctly
- remove unnecessary whitespace in generate-sycl-stencil.py

Unrelated:
- git clone with depth=1 in install-pstl.sh

* add missing block size to C++11 thread+async transpose
* use correct binary names
* add C++11 flags for SYCL
* wrong scope for SYCLDIR
* triSYCL requires C++14
* limit SYCL to Mac in Travis
---
 Cxx11/generate-sycl-stencil.py |  2 +-
 Cxx11/stencil-sycl.cc          |  2 +-
 common/make.defs.gcc           | 11 ++++++++---
 common/make.defs.intel         |  7 ++++++-
 common/make.defs.llvm          |  7 ++++++-
 travis/build-run-prk.sh        | 24 ++++++++++++++++++++++++
 travis/install-deps.sh         |  1 +
 travis/install-pstl.sh         |  2 +-
 travis/install-sycl.sh         |  8 ++++++++
 9 files changed, 56 insertions(+), 8 deletions(-)
 create mode 100644 travis/install-sycl.sh

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index 8a8f44ddb..bc049c892 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -10,7 +10,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
     src.write('           cl::sycl::buffer<double, 2> d_in,\n')
     src.write('           cl::sycl::buffer<double, 2> d_out) {\n')
     src.write('  q.submit([&](cl::sycl::handler& h) {\n')
-    src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       \n')
+    src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);\n')
     src.write('    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);\n')
     src.write('    h.parallel_for<class '+pattern+str(radius)+'>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
     src.write('                                [=] (cl::sycl::item<2> it) {\n')
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index f1ecb8abe..dba4a44af 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -227,7 +227,7 @@ int main(int argc, char* argv[])
    
       if (iter==1) stencil_time = prk::wtime();
 
-      star2(q, n, d_in, d_out);
+      stencil(q, n, d_in, d_out);
 
       q.submit([&](cl::sycl::handler& h) {
 
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 3e7b55235..074a1b696 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -46,9 +46,14 @@ OPENCLFLAG=-framework OpenCL
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
-SYCLCXX=${CXX}
-SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+#SYCLDIR=./triSYCL
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+SYCLCXX=${CXX} ${OPENMPFLAG}
+SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 087964e2c..49beeb6fa 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -44,8 +44,13 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
-SYCLCXX=${CXX}
+SYCLCXX=${CXX} ${OPENMPFLAG}
 SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+SYCLCXX=${CXX} ${OPENMPFLAG}
+SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 #
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 40af0143b..c67e8b7db 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -66,8 +66,13 @@ SYCLFLAG+=-no-serial-memop
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 #SYCLDIR=./triSYCL
-#SYCLCXX=${CXX}
+#SYCLCXX=${CXX} ${OPENMPFLAG}
 #SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+SYCLCXX=${CXX} ${OPENMPFLAG}
+SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 OCCADIR=${HOME}/prk-repo/Cxx11/occa
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 55898719f..3dbe3cfe1 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -586,6 +586,30 @@ case "$PRK_TARGET" in
         #    $PRK_TARGET_PATH/transpose-occa   10 1024 32
         #    $PRK_TARGET_PATH/nstream-occa     10 16777216 32
         #fi
+
+        # C++ w/ SYCL
+        # triSYCL requires Boost.  We are having Boost issues with Travis Linux builds.
+        if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
+            SYCLDIR=${TRAVIS_ROOT}/triSYCL
+            if [ "${CC}" = "clang" ] ; then
+                # SYCL will compile without OpenMP
+                echo "SYCLCXX=${PRK_CXX} -pthread -std=c++14" >> common/make.defs
+            else
+                echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++14" >> common/make.defs
+            fi
+            echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs
+            make -C $PRK_TARGET_PATH stencil-sycl transpose-sycl nstream-sycl
+            $PRK_TARGET_PATH/stencil-sycl     10 1000
+            $PRK_TARGET_PATH/transpose-sycl   10 1024 32
+            $PRK_TARGET_PATH/nstream-sycl     10 16777216 32
+            #echo "Test stencil code generator"
+            for s in star ; do # grid ; do # grid not supported yet
+                for r in 1 2 3 4 5 ; do
+                    $PRK_TARGET_PATH/stencil-sycl 10 200 20 $s $r
+                done
+            done
+        fi
+
         ;;
     allfortran)
         echo "Fortran"
diff --git a/travis/install-deps.sh b/travis/install-deps.sh
index 42b620858..3917e2fec 100755
--- a/travis/install-deps.sh
+++ b/travis/install-deps.sh
@@ -70,6 +70,7 @@ case "$PRK_TARGET" in
         sh ./travis/install-raja.sh $TRAVIS_ROOT
         sh ./travis/install-kokkos.sh $TRAVIS_ROOT
         #sh ./travis/install-occa.sh $TRAVIS_ROOT
+        sh ./travis/install-sycl.sh $TRAVIS_ROOT
         ;;
     allfortran)
         echo "Fortran"
diff --git a/travis/install-pstl.sh b/travis/install-pstl.sh
index 5f68368f8..ed5aba69b 100644
--- a/travis/install-pstl.sh
+++ b/travis/install-pstl.sh
@@ -5,4 +5,4 @@ set -x
 
 TRAVIS_ROOT="$1"
 
-git clone https://github.com/intel/parallelstl.git $TRAVIS_ROOT/pstl
+git clone --depth 1 https://github.com/intel/parallelstl.git $TRAVIS_ROOT/pstl
diff --git a/travis/install-sycl.sh b/travis/install-sycl.sh
new file mode 100644
index 000000000..3ac157a3f
--- /dev/null
+++ b/travis/install-sycl.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+set -e
+set -x
+
+TRAVIS_ROOT="$1"
+
+git clone --depth 1 https://github.com/triSYCL/triSYCL.git $TRAVIS_ROOT/triSYCL

From 18b6c95b981ffb94818f11ed8169c2d209a18fa1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 16 Mar 2018 21:43:11 -0700
Subject: [PATCH 051/245] clean FORTRAN ignoring [ci skip]

---
 .gitignore | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6e6fe1688..950d52c2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,17 +43,6 @@ func.c                          # PRK C89 stencil generated code
 *.output                        # ALCF Cobalt scheduler
 *.error                         # ALCF Cobalt scheduler
 
-FORTRAN/Stencil/stencil
-FORTRAN/Stencil/stencil-coarray
-FORTRAN/Stencil/stencil-omp
-FORTRAN/Stencil/stencil-pretty
-FORTRAN/Synch_p2p/p2p
-FORTRAN/Synch_p2p/p2p-coarray
-FORTRAN/Synch_p2p/p2p-omp
-FORTRAN/Transpose/transpose
-FORTRAN/Transpose/transpose-coarray
-FORTRAN/Transpose/transpose-omp
-FORTRAN/Transpose/transpose-pretty
 MPI1/AMR/amr
 MPI1/Branch/branch
 MPI1/DGEMM/dgemm
@@ -186,6 +175,14 @@ Cxx11/star6.cl
 Cxx11/star7.cl
 Cxx11/star8.cl
 Cxx11/star9.cl
+FORTRAN/dgemm-taskloop-openmp
+FORTRAN/dgemm-pretty
+FORTRAN/dgemm-openmp
+FORTRAN/dgemm
+FORTRAN/nstream
+FORTRAN/nstream-openmp
+FORTRAN/nstream-pretty
+FORTRAN/nstream-taskloop-openmp
 FORTRAN/p2p
 FORTRAN/p2p-innerloop
 FORTRAN/p2p-coarray

From edd2d6792c0d681735aec3ca96307f5b70ff92ae Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 17 Mar 2018 07:03:18 -0700
Subject: [PATCH 052/245] CUDA stencil (#321)

* make no tiling transpose explicit
* add stencil-cuda to make
* add CUDA stencil
* fix CUDA stencil code gen for Coriander
* fix issues with grid/block sizes
---
 Cxx11/Makefile                |   5 +-
 Cxx11/generate-cxx-stencil.py |  11 +-
 Cxx11/stencil-cuda.cu         | 275 ++++++++++++++++++++++++
 Cxx11/stencil_cuda.hpp        | 385 ++++++++++++++++++++++++++++++++++
 Cxx11/transpose-cuda.cu       |  23 +-
 5 files changed, 691 insertions(+), 8 deletions(-)
 create mode 100644 Cxx11/stencil-cuda.cu
 create mode 100644 Cxx11/stencil_cuda.hpp

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index d3cacecd2..f6fed48ca 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -68,7 +68,8 @@ p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-task
 
 stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
-	 stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl
+	 stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl \
+	 stencil-cuda
 
 transpose: transpose-valarray transpose-vector transpose-vector-async transpose-vector-openmp transpose-openmp-target \
 	   transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \
@@ -107,7 +108,7 @@ kokkos: stencil-kokkos transpose-kokkos nstream-kokkos
 
 raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja
 
-cuda: transpose-cuda nstream-cuda
+cuda: stencil-cuda transpose-cuda nstream-cuda
 
 cublas: transpose-cublas nstream-cublas
 
diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index 37e6077d3..134cd0e89 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 
 import sys
 import fileinput
@@ -69,6 +69,11 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>('+str(radius)+',n-'+str(radius)+'), KOKKOS_LAMBDA(const int i) {\n')
         src.write('      PRAGMA_SIMD\n')
         src.write('      for (auto j='+str(radius)+'; j<n-'+str(radius)+'; ++j) {\n')
+    elif (model=='cuda'):
+        src.write('__global__ void '+pattern+str(radius)+'(const int n, const prk_float * in, prk_float * out) {\n')
+        src.write('    const int i = blockIdx.x * blockDim.x + threadIdx.x;\n')
+        src.write('    const int j = blockIdx.y * blockDim.y + threadIdx.y;\n')
+        src.write('    if ( ('+str(radius)+' <= i) && (i < n-'+str(radius)+') && ('+str(radius)+' <= j) && (j < n-'+str(radius)+') ) {\n')
     else:
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    for (auto it='+str(radius)+'; it<n-'+str(radius)+'; it+=t) {\n')
@@ -110,6 +115,8 @@ def codegen(src,pattern,stencil_size,radius,W,model):
     elif (model=='target'):
         src.write('       }\n')
         src.write('     }\n')
+    elif (model=='cuda'):
+        src.write('     }\n')
     else:
         src.write('           }\n')
         src.write('         }\n')
@@ -143,7 +150,7 @@ def instance(src,model,pattern,r):
     codegen(src,pattern,stencil_size,r,W,model)
 
 def main():
-    for model in ['seq','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','kokkos']:
+    for model in ['seq','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','kokkos','cuda']:
       src = open('stencil_'+model+'.hpp','w')
       if (model=='target'):
           src.write('#define RESTRICT __restrict__\n\n')
diff --git a/Cxx11/stencil-cuda.cu b/Cxx11/stencil-cuda.cu
new file mode 100644
index 000000000..ba544ada7
--- /dev/null
+++ b/Cxx11/stencil-cuda.cu
@@ -0,0 +1,275 @@
+
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "in" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+#include "stencil_cuda.hpp"
+
+__global__ void nothing(const int n, const prk_float * in, prk_float * out)
+{
+    //printf("You are trying to use a stencil that does not exist.\n");
+    //printf("Please generate the new stencil using the code generator.\n");
+    // n will never be zero - this is to silence compiler warnings.
+    //if (n==0) printf("in=%p out=%p\n", in, out);
+    //abort();
+}
+
+__global__ void add(const int n, prk_float * in)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if ((i<n) && (j<n)) {
+        in[i*n+j] += (prk_float)1;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUDA Stencil execution on 2D grid" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, n, radius, tile_size;
+  bool star = true;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n  = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+  std::cout << "Radius of stencil    = " << radius << std::endl;
+
+  auto stencil = nothing;
+  if (star) {
+      switch (radius) {
+          case 1: stencil = star1; break;
+          case 2: stencil = star2; break;
+          case 3: stencil = star3; break;
+          case 4: stencil = star4; break;
+          case 5: stencil = star5; break;
+      }
+  } else {
+      switch (radius) {
+          case 1: stencil = grid1; break;
+          case 2: stencil = grid2; break;
+          case 3: stencil = grid3; break;
+          case 4: stencil = grid4; break;
+          case 5: stencil = grid5; break;
+      }
+  }
+
+  dim3 dimGrid(prk::divceil(n,tile_size),prk::divceil(n,tile_size),1);
+  dim3 dimBlock(tile_size, tile_size, 1);
+  info.checkDims(dimBlock, dimGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto stencil_time = 0.0;
+
+  const size_t nelems = (size_t)n * (size_t)n;
+  const size_t bytes = nelems * sizeof(prk_float);
+  prk_float * h_in;
+  prk_float * h_out;
+#ifndef __CORIANDERCC__
+  prk::CUDA::check( cudaMallocHost((void**)&h_in, bytes) );
+  prk::CUDA::check( cudaMallocHost((void**)&h_out, bytes) );
+#else
+  h_in = new prk_float[nelems];
+  h_out = new prk_float[nelems];
+#endif
+
+  for (auto i=0; i<n; i++) {
+    for (auto j=0; j<n; j++) {
+      h_in[i*n+j]  = static_cast<prk_float>(i+j);
+      h_out[i*n+j] = static_cast<prk_float>(0);
+    }
+  }
+
+  // copy input from host to device
+  prk_float * d_in;
+  prk_float * d_out;
+  prk::CUDA::check( cudaMalloc((void**)&d_in, bytes) );
+  prk::CUDA::check( cudaMalloc((void**)&d_out, bytes) );
+  prk::CUDA::check( cudaMemcpy(d_in, &(h_in[0]), bytes, cudaMemcpyHostToDevice) );
+  prk::CUDA::check( cudaMemcpy(d_out, &(h_out[0]), bytes, cudaMemcpyHostToDevice) );
+
+  for (auto iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) stencil_time = prk::wtime();
+
+    // Apply the stencil operator
+    stencil<<<dimGrid, dimBlock>>>(n, d_in, d_out);
+
+    // Add constant to solution to force refresh of neighbor data, if any
+    add<<<dimGrid, dimBlock>>>(n, d_in);
+
+#ifndef __CORIANDERCC__
+    // silence "ignoring cudaDeviceSynchronize for now" warning
+    prk::CUDA::check( cudaDeviceSynchronize() );
+#endif
+  }
+  stencil_time = prk::wtime() - stencil_time;
+
+  // copy output back to host
+  prk::CUDA::check( cudaMemcpy(&(h_out[0]), d_out, bytes, cudaMemcpyDeviceToHost) );
+
+#ifdef VERBOSE
+  // copy input back to host - debug only
+  prk::CUDA::check( cudaMemcpy(&(h_in[0]), d_in, bytes, cudaMemcpyDeviceToHost) );
+#endif
+
+  prk::CUDA::check( cudaFree(d_out) );
+  prk::CUDA::check( cudaFree(d_in) );
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  // interior of grid with respect to stencil
+  size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
+  // compute L1 norm
+  double norm = 0.0;
+  for (auto i=radius; i<n-radius; i++) {
+    for (auto j=radius; j<n-radius; j++) {
+      norm += std::fabs(h_out[i*n+j]);
+    }
+  }
+  norm /= active_points;
+
+  // verify correctness
+  const double epsilon = 1.0e-8;
+  double reference_norm = 2.*(iterations+1.);
+  if (std::fabs(norm-reference_norm) > epsilon) {
+    std::cout << "ERROR: L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+    return 1;
+  } else {
+    std::cout << "Solution validates" << std::endl;
+#ifdef VERBOSE
+    std::cout << "L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+#endif
+    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+    size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
+    auto avgtime = stencil_time/iterations;
+    std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
diff --git a/Cxx11/stencil_cuda.hpp b/Cxx11/stencil_cuda.hpp
new file mode 100644
index 000000000..1783327fa
--- /dev/null
+++ b/Cxx11/stencil_cuda.hpp
@@ -0,0 +1,385 @@
+__global__ void star1(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (1 <= i) && (i < n-1) && (1 <= j) && (j < n-1) ) {
+            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
+                          +in[(i+0)*n+(j+-1)] * -0.5
+                          +in[(i+0)*n+(j+1)] * 0.5
+                          +in[(i+1)*n+(j+0)] * 0.5;
+     }
+}
+
+__global__ void star2(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (2 <= i) && (i < n-2) && (2 <= j) && (j < n-2) ) {
+            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
+                          +in[(i+-1)*n+(j+0)] * -0.25
+                          +in[(i+0)*n+(j+-2)] * -0.125
+                          +in[(i+0)*n+(j+-1)] * -0.25
+                          +in[(i+0)*n+(j+1)] * 0.25
+                          +in[(i+0)*n+(j+2)] * 0.125
+                          +in[(i+1)*n+(j+0)] * 0.25
+                          +in[(i+2)*n+(j+0)] * 0.125;
+     }
+}
+
+__global__ void star3(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (3 <= i) && (i < n-3) && (3 <= j) && (j < n-3) ) {
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+     }
+}
+
+__global__ void star4(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (4 <= i) && (i < n-4) && (4 <= j) && (j < n-4) ) {
+            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+0)] * -0.0625
+                          +in[(i+-1)*n+(j+0)] * -0.125
+                          +in[(i+0)*n+(j+-4)] * -0.03125
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0625
+                          +in[(i+0)*n+(j+-1)] * -0.125
+                          +in[(i+0)*n+(j+1)] * 0.125
+                          +in[(i+0)*n+(j+2)] * 0.0625
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
+                          +in[(i+0)*n+(j+4)] * 0.03125
+                          +in[(i+1)*n+(j+0)] * 0.125
+                          +in[(i+2)*n+(j+0)] * 0.0625
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
+                          +in[(i+4)*n+(j+0)] * 0.03125;
+     }
+}
+
+__global__ void star5(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (5 <= i) && (i < n-5) && (5 <= j) && (j < n-5) ) {
+            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
+                          +in[(i+-4)*n+(j+0)] * -0.025
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
+                          +in[(i+-2)*n+(j+0)] * -0.05
+                          +in[(i+-1)*n+(j+0)] * -0.1
+                          +in[(i+0)*n+(j+-5)] * -0.02
+                          +in[(i+0)*n+(j+-4)] * -0.025
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.05
+                          +in[(i+0)*n+(j+-1)] * -0.1
+                          +in[(i+0)*n+(j+1)] * 0.1
+                          +in[(i+0)*n+(j+2)] * 0.05
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
+                          +in[(i+0)*n+(j+4)] * 0.025
+                          +in[(i+0)*n+(j+5)] * 0.02
+                          +in[(i+1)*n+(j+0)] * 0.1
+                          +in[(i+2)*n+(j+0)] * 0.05
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
+                          +in[(i+4)*n+(j+0)] * 0.025
+                          +in[(i+5)*n+(j+0)] * 0.02;
+     }
+}
+
+__global__ void grid1(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (1 <= i) && (i < n-1) && (1 <= j) && (j < n-1) ) {
+            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
+                          +in[(i+-1)*n+(j+0)] * -0.25
+                          +in[(i+0)*n+(j+-1)] * -0.25
+                          +in[(i+0)*n+(j+1)] * 0.25
+                          +in[(i+1)*n+(j+0)] * 0.25
+                          +in[(i+1)*n+(j+1)] * 0.25
+                          ;
+     }
+}
+
+__global__ void grid2(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (2 <= i) && (i < n-2) && (2 <= j) && (j < n-2) ) {
+            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-1)] * -0.125
+                          +in[(i+-1)*n+(j+0)] * -0.125
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
+                          +in[(i+0)*n+(j+-1)] * -0.125
+                          +in[(i+0)*n+(j+1)] * 0.125
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j+0)] * 0.125
+                          +in[(i+1)*n+(j+1)] * 0.125
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+2)] * 0.0625
+                          ;
+     }
+}
+
+__global__ void grid3(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (3 <= i) && (i < n-3) && (3 <= j) && (j < n-3) ) {
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
+                          ;
+     }
+}
+
+__global__ void grid4(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (4 <= i) && (i < n-4) && (4 <= j) && (j < n-4) ) {
+            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-2)*n+(j+-2)] * -0.03125
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
+                          +in[(i+-1)*n+(j+-1)] * -0.0625
+                          +in[(i+-1)*n+(j+0)] * -0.0625
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
+                          +in[(i+0)*n+(j+-1)] * -0.0625
+                          +in[(i+0)*n+(j+1)] * 0.0625
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j+0)] * 0.0625
+                          +in[(i+1)*n+(j+1)] * 0.0625
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+4)] * 0.015625
+                          ;
+     }
+}
+
+__global__ void grid5(const int n, const prk_float * in, prk_float * out) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if ( (5 <= i) && (i < n-5) && (5 <= j) && (j < n-5) ) {
+            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-4)] * -0.0125
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-2)*n+(j+-2)] * -0.025
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
+                          +in[(i+-1)*n+(j+-1)] * -0.05
+                          +in[(i+-1)*n+(j+0)] * -0.05
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.05
+                          +in[(i+0)*n+(j+1)] * 0.05
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j+0)] * 0.05
+                          +in[(i+1)*n+(j+1)] * 0.05
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+5)] * 0.01
+                          ;
+     }
+}
+
diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu
index ddb22ca92..1efdab462 100644
--- a/Cxx11/transpose-cuda.cu
+++ b/Cxx11/transpose-cuda.cu
@@ -56,6 +56,8 @@
 #include "prk_util.h"
 #include "prk_cuda.h"
 
+#define TILED 0
+
 #if TILED
 // The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu,
 // which is the reason for the additional copyright noted above.
@@ -100,7 +102,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////
 
   int iterations;
-  int order;
+  int order, tile_size;
   try {
       if (argc < 3) {
         throw "Usage: <# iterations> <matrix order>";
@@ -123,6 +125,14 @@ int main(int argc, char * argv[])
           std::cout << "Sorry, but order (" << order << ") must be evenly divible by " << tile_dim
                     << " or the results are going to be wrong.\n";
       }
+#else
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = order;
+          if (tile_size > order) tile_size = order;
+      }
 #endif
 #ifdef __CORIANDERCC__
       // This has not been analyzed, but it is an empirical fact.
@@ -136,15 +146,20 @@ int main(int argc, char * argv[])
     return 1;
   }
 
-  std::cout << "Matrix order          = " << order << std::endl;
   std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+#if TILED
+  std::cout << "Tile size            = " << tile_dim << std::endl;
+#else
+  std::cout << "Tile size            = " << tile_size << std::endl;
+#endif
 
 #if TILED
   dim3 dimGrid(order/tile_dim, order/tile_dim, 1);
   dim3 dimBlock(tile_dim, block_rows, 1);
 #else
-  dim3 dimGrid(order, order, 1);
-  dim3 dimBlock(1, 1, 1);
+  dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
+  dim3 dimBlock(tile_size, tile_size, 1);
 #endif
 
   info.checkDims(dimBlock, dimGrid);

From eef54724574fdeca7737350fbdcdc3c6061a442d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 17 Mar 2018 07:30:03 -0700
Subject: [PATCH 053/245] minor fixes in stencil sycl (#319)

---
 Cxx11/stencil-sycl.cc | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index dba4a44af..24569821c 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -113,12 +113,12 @@ int main(int argc, char* argv[])
   //////////////////////////////////////////////////////////////////////
 
   int iterations;
-  size_t n;
-  size_t radius = 2;
+  size_t n, tile_size;
   bool star = true;
+  size_t radius = 2;
   try {
       if (argc < 3) {
-        throw "Usage: <# iterations> <array dimension>";
+        throw "Usage: <# iterations> <array dimension> [<tile size> <star/grid> <stencil radius>]";
       }
 
       // number of times to run the algorithm
@@ -135,7 +135,6 @@ int main(int argc, char* argv[])
         throw "ERROR: grid dimension too large - overflow risk";
       }
 
-#if 0
       // default tile size for tiling of local transpose
       tile_size = 32;
       if (argc > 3) {
@@ -143,7 +142,6 @@ int main(int argc, char* argv[])
           if (tile_size <= 0) tile_size = n;
           if (tile_size > n) tile_size = n;
       }
-#endif
 
       // stencil pattern
       if (argc > 4) {
@@ -181,18 +179,18 @@ int main(int argc, char* argv[])
           case 4: stencil = star4; break;
           case 5: stencil = star5; break;
       }
-  } else {
+  }
+#if 0
+  else {
       switch (radius) {
-          //case 1: stencil = grid1; break;
-          //case 2: stencil = grid2; break;
-          //case 3: stencil = grid3; break;
-          //case 4: stencil = grid4; break;
-          //case 5: stencil = grid5; break;
+          case 1: stencil = grid1; break;
+          case 2: stencil = grid2; break;
+          case 3: stencil = grid3; break;
+          case 4: stencil = grid4; break;
+          case 5: stencil = grid5; break;
       }
   }
-
-  // SYCL device queue
-  cl::sycl::queue q;
+#endif
 
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
@@ -202,6 +200,8 @@ int main(int argc, char* argv[])
 
   std::vector<double> h_out(n*n,0.0);
 
+  // SYCL device queue
+  cl::sycl::queue q;
   {
     // initialize device buffers from host buffers
     cl::sycl::buffer<double, 2> d_in  { cl::sycl::range<2> {n, n} };
@@ -224,7 +224,7 @@ int main(int argc, char* argv[])
     q.wait();
 
     for (auto iter = 0; iter<=iterations; iter++) {
-   
+
       if (iter==1) stencil_time = prk::wtime();
 
       stencil(q, n, d_in, d_out);
@@ -234,7 +234,7 @@ int main(int argc, char* argv[])
         // accessor methods
         auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
         auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-       
+
         // Add constant to solution to force refresh of neighbor data, if any
         h.parallel_for<class add>(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0},
                                   [=] (cl::sycl::item<2> it) {

From 892c85dd4f415563fe652f43c45d4fb9b1dd0a7f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 17 Mar 2018 17:20:23 -0700
Subject: [PATCH 054/245] Sycl stencil 1d (#320)

support 1D and 2D indexing in SYCL codes

Also fixed correctness issue in stencil-sycl that was only observed with triSYCL.  Needed to synchronize between application of the stencil and the add() update.
---
 Cxx11/generate-sycl-stencil.py |  83 +++++++++----
 Cxx11/stencil-sycl.cc          |  85 +++++++------
 Cxx11/stencil_sycl.hpp         | 220 +++++++++++++++++++++++++--------
 Cxx11/transpose-sycl.cc        |  28 +++--
 README.md                      |   2 +
 5 files changed, 294 insertions(+), 124 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index bc049c892..e0c0cae1e 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -1,36 +1,74 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 
 import sys
 import fileinput
 import string
 import os
 
-def codegen(src,pattern,stencil_size,radius,W,model):
+def codegen(src,pattern,stencil_size,radius,W,model,dim):
     src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n,\n')
-    src.write('           cl::sycl::buffer<double, 2> d_in,\n')
-    src.write('           cl::sycl::buffer<double, 2> d_out) {\n')
+    if (dim==2):
+        src.write('           cl::sycl::buffer<double, 2> & d_in,\n')
+        src.write('           cl::sycl::buffer<double, 2> & d_out) {\n')
+    else:
+        src.write('           cl::sycl::buffer<double> & d_in,\n')
+        src.write('           cl::sycl::buffer<double> & d_out) {\n')
     src.write('  q.submit([&](cl::sycl::handler& h) {\n')
     src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);\n')
     src.write('    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);\n')
-    src.write('    h.parallel_for<class '+pattern+str(radius)+'>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
-    src.write('                                [=] (cl::sycl::item<2> it) {\n')
-    src.write('        cl::sycl::id<2> xy = it.get_id();\n')
-    for r in range(1,radius+1):
-        src.write('        cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
-        src.write('        cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
-    src.write('        out[xy] += ')
+    if (dim==2):
+        src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
+        src.write('                                [=] (cl::sycl::item<2> it) {\n')
+        src.write('        cl::sycl::id<2> xy = it.get_id();\n')
+        for r in range(1,radius+1):
+            src.write('        cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
+            src.write('        cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
+        src.write('        out[xy] += ')
+    else:
+        src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
+        src.write('                                [=] (cl::sycl::item<2> it) {\n')
+        # 1D indexing the slow way
+        #src.write('        auto i = it[0];\n')
+        #src.write('        auto j = it[1];\n')
+        #src.write('        out[i*n+j] += ')
+        # 1D indexing the fast way
+        src.write('        out[it[0]*n+it[1]] += ')
     if pattern == 'star':
         for i in range(1,radius+1):
-            if i > 1:
-                src.write('\n')
-                src.write(19*' ')
-            src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius)))
-            src.write('\n'+19*' ')
-            src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius)))
-            src.write('\n'+19*' ')
-            src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius)))
-            src.write('\n'+19*' ')
-            src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius)))
+            if (dim==2):
+                if i > 1:
+                    src.write('\n')
+                    src.write(19*' ')
+                src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius)))
+                src.write('\n'+19*' ')
+                src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius)))
+                src.write('\n'+19*' ')
+                src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius)))
+                src.write('\n'+19*' ')
+                src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius)))
+            else:
+                # 1D indexing the slow way
+                #if i > 1:
+                #    src.write('\n')
+                #    src.write(22*' ')
+                #src.write('+in[i*n+(j+'+str(i)+')] * '+str(+1./(2.*i*radius)))
+                #src.write('\n'+22*' ')
+                #src.write('+in[i*n+(j-'+str(i)+')] * '+str(-1./(2.*i*radius)))
+                #src.write('\n'+22*' ')
+                #src.write('+in[(i+'+str(i)+')*n+j] * '+str(+1./(2.*i*radius)))
+                #src.write('\n'+22*' ')
+                #src.write('+in[(i-'+str(i)+')*n+j] * '+str(-1./(2.*i*radius)))
+                # 1D indexing the fast way
+                if i > 1:
+                    src.write('\n')
+                    src.write(30*' ')
+                src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * '+str(+1./(2.*i*radius)))
+                src.write('\n'+30*' ')
+                src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * '+str(-1./(2.*i*radius)))
+                src.write('\n'+30*' ')
+                src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * '+str(+1./(2.*i*radius)))
+                src.write('\n'+30*' ')
+                src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * '+str(-1./(2.*i*radius)))
             if i == radius:
                 src.write(';\n')
     else:
@@ -62,7 +100,8 @@ def instance(src,model,pattern,r):
             W[r+j][r+j]    = +1./(4*j*r)
             W[r-j][r-j]    = -1./(4*j*r)
 
-    codegen(src,pattern,stencil_size,r,W,model)
+    codegen(src,pattern,stencil_size,r,W,model,1)
+    codegen(src,pattern,stencil_size,r,W,model,2)
 
 def main():
     for model in ['sycl']:
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 24569821c..c4bfa6ff8 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -60,10 +60,16 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#define USE_2D_INDEXING 0
+
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
-void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> d_in, cl::sycl::buffer<double, 2> d_out)
+#if USE_2D_INDEXING
+void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+#else
+void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+#endif
 {
     std::cout << "You are trying to use a stencil that does not exist.\n";
     std::cout << "Please generate the new stencil using the code generator\n";
@@ -71,38 +77,6 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> d_
     std::abort();
 }
 
-#if 0
-void star2(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> d_in,
-           cl::sycl::buffer<double, 2> d_out)
-{
-   q.submit([&](cl::sycl::handler& h) {
-
-     // accessor methods
-     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-
-     // Apply the stencil operator
-     h.parallel_for<class star2>(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2},
-                                 [=] (cl::sycl::item<2> it) {
-         cl::sycl::id<2> xy = it.get_id();
-         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-         cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-         cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-         cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-         out[xy] += +in[xy-dx1] * -0.25
-                    +in[xy+dx1] *  0.25
-                    +in[xy-dy1] * -0.25
-                    +in[xy+dy1] *  0.25
-                    +in[xy-dx2] * -0.125
-                    +in[xy+dx2] *  0.125
-                    +in[xy-dy2] * -0.125
-                    +in[xy+dy2] *  0.125;
-     });
-   });
-}
-#endif
-
 int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
@@ -198,27 +172,39 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
+  std::vector<double> h_in(n*n,0.0);
   std::vector<double> h_out(n*n,0.0);
 
   // SYCL device queue
   cl::sycl::queue q;
   {
     // initialize device buffers from host buffers
+#if USE_2D_INDEXING
     cl::sycl::buffer<double, 2> d_in  { cl::sycl::range<2> {n, n} };
     cl::sycl::buffer<double, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
+#else
+    // FIXME: if I don't initialize this buffer from host, the results are wrong.  Why?
+    //cl::sycl::buffer<double> d_in  { cl::sycl::range<1> {n*n} };
+    cl::sycl::buffer<double> d_in  { h_in.data(),  h_in.size() };
+    cl::sycl::buffer<double> d_out { h_out.data(), h_out.size() };
+#endif
 
     q.submit([&](cl::sycl::handler& h) {
 
       // accessor methods
       auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
 
-      // Add constant to solution to force refresh of neighbor data, if any
-      h.parallel_for<class init>(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0},
-                                [=] (cl::sycl::item<2> it) {
+      h.parallel_for<class init>(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) {
+#if USE_2D_INDEXING
           cl::sycl::id<2> xy = it.get_id();
-          auto i = xy[0];
-          auto j = xy[1];
+          auto i = it[0];
+          auto j = it[1];
           in[xy] = static_cast<double>(i+j);
+#else
+          auto i = it[0];
+          auto j = it[1];
+          in[i*n+j] = static_cast<double>(i+j);
+#endif
       });
     });
     q.wait();
@@ -228,18 +214,29 @@ int main(int argc, char* argv[])
       if (iter==1) stencil_time = prk::wtime();
 
       stencil(q, n, d_in, d_out);
+      // This is only necessary with triSYCL
+      q.wait();
 
       q.submit([&](cl::sycl::handler& h) {
 
         // accessor methods
         auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
-        auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
 
         // Add constant to solution to force refresh of neighbor data, if any
-        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0},
+        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0},
                                   [=] (cl::sycl::item<2> it) {
+#if USE_2D_INDEXING
             cl::sycl::id<2> xy = it.get_id();
             in[xy] += 1.0;
+#else
+#if 0 // This is noticeably slower :-(
+            auto i = it[0];
+            auto j = it[1];
+            in[i*n+j] += 1.0;
+#else
+            in[it[0]*n+it[1]] += 1.0;
+#endif
+#endif
         });
       });
       q.wait();
@@ -247,6 +244,14 @@ int main(int argc, char* argv[])
     stencil_time = prk::wtime() - stencil_time;
   }
 
+#if 0
+  for (auto i=0; i<n; i++) {
+    for (auto j=0; j<n; j++) {
+        std::cerr << i << "," << j << "," << h_out[i*n+j] << "\n";
+    }
+  }
+#endif
+
   //////////////////////////////////////////////////////////////////////
   // Analyze and output results.
   //////////////////////////////////////////////////////////////////////
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 845082e62..18391ce41 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -1,29 +1,65 @@
 void star1(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> d_in,
-           cl::sycl::buffer<double, 2> d_out) {
+           cl::sycl::buffer<double> & d_in,
+           cl::sycl::buffer<double> & d_out) {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star1>(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1},
+    h.parallel_for<class star1_1d>(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1},
+                                [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5
+                              +in[it[0]*n+(it[1]-1)] * -0.5
+                              +in[(it[0]+1)*n+it[1]] * 0.5
+                              +in[(it[0]-1)*n+it[1]] * -0.5;
+    });
+  });
+}
+
+void star1(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> & d_in,
+           cl::sycl::buffer<double, 2> & d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star1_2d>(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1},
                                 [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
         cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
         out[xy] += +in[xy+dx1] * 0.5
-                   +in[xy+dy1] * 0.5
                    +in[xy-dx1] * -0.5
+                   +in[xy+dy1] * 0.5
                    +in[xy-dy1] * -0.5;
     });
   });
 }
 
 void star2(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> d_in,
-           cl::sycl::buffer<double, 2> d_out) {
+           cl::sycl::buffer<double> & d_in,
+           cl::sycl::buffer<double> & d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star2_1d>(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2},
+                                [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25
+                              +in[it[0]*n+(it[1]-1)] * -0.25
+                              +in[(it[0]+1)*n+it[1]] * 0.25
+                              +in[(it[0]-1)*n+it[1]] * -0.25
+                              +in[it[0]*n+(it[1]+2)] * 0.125
+                              +in[it[0]*n+(it[1]-2)] * -0.125
+                              +in[(it[0]+2)*n+it[1]] * 0.125
+                              +in[(it[0]-2)*n+it[1]] * -0.125;
+    });
+  });
+}
+
+void star2(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> & d_in,
+           cl::sycl::buffer<double, 2> & d_out) {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star2>(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2},
+    h.parallel_for<class star2_2d>(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2},
                                 [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
@@ -31,24 +67,48 @@ void star2(cl::sycl::queue & q, const size_t n,
         cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
         cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
         out[xy] += +in[xy+dx1] * 0.25
-                   +in[xy+dy1] * 0.25
                    +in[xy-dx1] * -0.25
+                   +in[xy+dy1] * 0.25
                    +in[xy-dy1] * -0.25
                    +in[xy+dx2] * 0.125
-                   +in[xy+dy2] * 0.125
                    +in[xy-dx2] * -0.125
+                   +in[xy+dy2] * 0.125
                    +in[xy-dy2] * -0.125;
     });
   });
 }
 
 void star3(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> d_in,
-           cl::sycl::buffer<double, 2> d_out) {
+           cl::sycl::buffer<double> & d_in,
+           cl::sycl::buffer<double> & d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star3_1d>(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3},
+                                [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.166666666667
+                              +in[it[0]*n+(it[1]-1)] * -0.166666666667
+                              +in[(it[0]+1)*n+it[1]] * 0.166666666667
+                              +in[(it[0]-1)*n+it[1]] * -0.166666666667
+                              +in[it[0]*n+(it[1]+2)] * 0.0833333333333
+                              +in[it[0]*n+(it[1]-2)] * -0.0833333333333
+                              +in[(it[0]+2)*n+it[1]] * 0.0833333333333
+                              +in[(it[0]-2)*n+it[1]] * -0.0833333333333
+                              +in[it[0]*n+(it[1]+3)] * 0.0555555555556
+                              +in[it[0]*n+(it[1]-3)] * -0.0555555555556
+                              +in[(it[0]+3)*n+it[1]] * 0.0555555555556
+                              +in[(it[0]-3)*n+it[1]] * -0.0555555555556;
+    });
+  });
+}
+
+void star3(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> & d_in,
+           cl::sycl::buffer<double, 2> & d_out) {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star3>(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3},
+    h.parallel_for<class star3_2d>(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3},
                                 [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
@@ -57,29 +117,57 @@ void star3(cl::sycl::queue & q, const size_t n,
         cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
         cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
         cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-        out[xy] += +in[xy+dx1] * 0.16666666666666666
-                   +in[xy+dy1] * 0.16666666666666666
-                   +in[xy-dx1] * -0.16666666666666666
-                   +in[xy-dy1] * -0.16666666666666666
-                   +in[xy+dx2] * 0.08333333333333333
-                   +in[xy+dy2] * 0.08333333333333333
-                   +in[xy-dx2] * -0.08333333333333333
-                   +in[xy-dy2] * -0.08333333333333333
-                   +in[xy+dx3] * 0.05555555555555555
-                   +in[xy+dy3] * 0.05555555555555555
-                   +in[xy-dx3] * -0.05555555555555555
-                   +in[xy-dy3] * -0.05555555555555555;
+        out[xy] += +in[xy+dx1] * 0.166666666667
+                   +in[xy-dx1] * -0.166666666667
+                   +in[xy+dy1] * 0.166666666667
+                   +in[xy-dy1] * -0.166666666667
+                   +in[xy+dx2] * 0.0833333333333
+                   +in[xy-dx2] * -0.0833333333333
+                   +in[xy+dy2] * 0.0833333333333
+                   +in[xy-dy2] * -0.0833333333333
+                   +in[xy+dx3] * 0.0555555555556
+                   +in[xy-dx3] * -0.0555555555556
+                   +in[xy+dy3] * 0.0555555555556
+                   +in[xy-dy3] * -0.0555555555556;
     });
   });
 }
 
 void star4(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> d_in,
-           cl::sycl::buffer<double, 2> d_out) {
+           cl::sycl::buffer<double> & d_in,
+           cl::sycl::buffer<double> & d_out) {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star4>(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4},
+    h.parallel_for<class star4_1d>(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4},
+                                [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125
+                              +in[it[0]*n+(it[1]-1)] * -0.125
+                              +in[(it[0]+1)*n+it[1]] * 0.125
+                              +in[(it[0]-1)*n+it[1]] * -0.125
+                              +in[it[0]*n+(it[1]+2)] * 0.0625
+                              +in[it[0]*n+(it[1]-2)] * -0.0625
+                              +in[(it[0]+2)*n+it[1]] * 0.0625
+                              +in[(it[0]-2)*n+it[1]] * -0.0625
+                              +in[it[0]*n+(it[1]+3)] * 0.0416666666667
+                              +in[it[0]*n+(it[1]-3)] * -0.0416666666667
+                              +in[(it[0]+3)*n+it[1]] * 0.0416666666667
+                              +in[(it[0]-3)*n+it[1]] * -0.0416666666667
+                              +in[it[0]*n+(it[1]+4)] * 0.03125
+                              +in[it[0]*n+(it[1]-4)] * -0.03125
+                              +in[(it[0]+4)*n+it[1]] * 0.03125
+                              +in[(it[0]-4)*n+it[1]] * -0.03125;
+    });
+  });
+}
+
+void star4(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> & d_in,
+           cl::sycl::buffer<double, 2> & d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star4_2d>(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4},
                                 [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
@@ -91,32 +179,64 @@ void star4(cl::sycl::queue & q, const size_t n,
         cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
         cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
         out[xy] += +in[xy+dx1] * 0.125
-                   +in[xy+dy1] * 0.125
                    +in[xy-dx1] * -0.125
+                   +in[xy+dy1] * 0.125
                    +in[xy-dy1] * -0.125
                    +in[xy+dx2] * 0.0625
-                   +in[xy+dy2] * 0.0625
                    +in[xy-dx2] * -0.0625
+                   +in[xy+dy2] * 0.0625
                    +in[xy-dy2] * -0.0625
-                   +in[xy+dx3] * 0.041666666666666664
-                   +in[xy+dy3] * 0.041666666666666664
-                   +in[xy-dx3] * -0.041666666666666664
-                   +in[xy-dy3] * -0.041666666666666664
+                   +in[xy+dx3] * 0.0416666666667
+                   +in[xy-dx3] * -0.0416666666667
+                   +in[xy+dy3] * 0.0416666666667
+                   +in[xy-dy3] * -0.0416666666667
                    +in[xy+dx4] * 0.03125
-                   +in[xy+dy4] * 0.03125
                    +in[xy-dx4] * -0.03125
+                   +in[xy+dy4] * 0.03125
                    +in[xy-dy4] * -0.03125;
     });
   });
 }
 
 void star5(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> d_in,
-           cl::sycl::buffer<double, 2> d_out) {
+           cl::sycl::buffer<double> & d_in,
+           cl::sycl::buffer<double> & d_out) {
+  q.submit([&](cl::sycl::handler& h) {
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    h.parallel_for<class star5_1d>(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5},
+                                [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1
+                              +in[it[0]*n+(it[1]-1)] * -0.1
+                              +in[(it[0]+1)*n+it[1]] * 0.1
+                              +in[(it[0]-1)*n+it[1]] * -0.1
+                              +in[it[0]*n+(it[1]+2)] * 0.05
+                              +in[it[0]*n+(it[1]-2)] * -0.05
+                              +in[(it[0]+2)*n+it[1]] * 0.05
+                              +in[(it[0]-2)*n+it[1]] * -0.05
+                              +in[it[0]*n+(it[1]+3)] * 0.0333333333333
+                              +in[it[0]*n+(it[1]-3)] * -0.0333333333333
+                              +in[(it[0]+3)*n+it[1]] * 0.0333333333333
+                              +in[(it[0]-3)*n+it[1]] * -0.0333333333333
+                              +in[it[0]*n+(it[1]+4)] * 0.025
+                              +in[it[0]*n+(it[1]-4)] * -0.025
+                              +in[(it[0]+4)*n+it[1]] * 0.025
+                              +in[(it[0]-4)*n+it[1]] * -0.025
+                              +in[it[0]*n+(it[1]+5)] * 0.02
+                              +in[it[0]*n+(it[1]-5)] * -0.02
+                              +in[(it[0]+5)*n+it[1]] * 0.02
+                              +in[(it[0]-5)*n+it[1]] * -0.02;
+    });
+  });
+}
+
+void star5(cl::sycl::queue & q, const size_t n,
+           cl::sycl::buffer<double, 2> & d_in,
+           cl::sycl::buffer<double, 2> & d_out) {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);       
+    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star5>(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5},
+    h.parallel_for<class star5_2d>(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5},
                                 [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
@@ -130,24 +250,24 @@ void star5(cl::sycl::queue & q, const size_t n,
         cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
         cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
         out[xy] += +in[xy+dx1] * 0.1
-                   +in[xy+dy1] * 0.1
                    +in[xy-dx1] * -0.1
+                   +in[xy+dy1] * 0.1
                    +in[xy-dy1] * -0.1
                    +in[xy+dx2] * 0.05
-                   +in[xy+dy2] * 0.05
                    +in[xy-dx2] * -0.05
+                   +in[xy+dy2] * 0.05
                    +in[xy-dy2] * -0.05
-                   +in[xy+dx3] * 0.03333333333333333
-                   +in[xy+dy3] * 0.03333333333333333
-                   +in[xy-dx3] * -0.03333333333333333
-                   +in[xy-dy3] * -0.03333333333333333
+                   +in[xy+dx3] * 0.0333333333333
+                   +in[xy-dx3] * -0.0333333333333
+                   +in[xy+dy3] * 0.0333333333333
+                   +in[xy-dy3] * -0.0333333333333
                    +in[xy+dx4] * 0.025
-                   +in[xy+dy4] * 0.025
                    +in[xy-dx4] * -0.025
+                   +in[xy+dy4] * 0.025
                    +in[xy-dy4] * -0.025
                    +in[xy+dx5] * 0.02
-                   +in[xy+dy5] * 0.02
                    +in[xy-dx5] * -0.02
+                   +in[xy+dy5] * 0.02
                    +in[xy-dy5] * -0.02;
     });
   });
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index cbbc1a2a1..e75897e77 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -51,6 +51,8 @@
 
 #include "prk_util.h"
 
+#define USE_2D_INDEXING 1
+
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
@@ -89,37 +91,34 @@ int main(int argc, char * argv[])
   std::cout << "Number of iterations  = " << iterations << std::endl;
   std::cout << "Matrix order          = " << order << std::endl;
 
-  // SYCL device queue
-  cl::sycl::queue q;
-
   //////////////////////////////////////////////////////////////////////
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
+  auto trans_time = 0.0;
+
   std::vector<double> h_A(order*order);
   std::vector<double> h_B(order*order,0.0);
 
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(h_A.begin(), h_A.end(), 0.0);
 
-  auto range = boost::irange(static_cast<size_t>(0),order);
-
-  auto trans_time = 0.0;
-
+  // SYCL device queue
+  cl::sycl::queue q;
   {
     // initialize device buffers from host buffers
 #if USE_2D_INDEXING
-    cl::sycl::buffer<double,2> d_A( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array
-    cl::sycl::buffer<double,2> d_B( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array
+    cl::sycl::buffer<double,2> d_A( h_A.data(), cl::sycl::range<2>{order,order} );
+    cl::sycl::buffer<double,2> d_B( h_B.data(), cl::sycl::range<2>{order,order} );
 #else
     cl::sycl::buffer<double> d_A { h_A.data(), h_A.size() };
     cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
 #endif
 
     for (auto iter = 0; iter<=iterations; iter++) {
- 
+
       if (iter==1) trans_time = prk::wtime();
- 
+
       q.submit([&](cl::sycl::handler& h) {
 
         // accessor methods
@@ -129,7 +128,10 @@ int main(int argc, char * argv[])
         // transpose
         h.parallel_for<class transpose>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
 #if USE_2D_INDEXING
-#error 2D indexing is not implemented yet.  Fix this!
+          cl::sycl::id<2> ij{it[0],it[1]};
+          cl::sycl::id<2> ji{it[1],it[0]};
+          B[ij] += A[ji];
+          A[ji] += 1.0;
 #else
           B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
           A[it[1] * order + it[0]] += 1.0;
@@ -149,6 +151,8 @@ int main(int argc, char * argv[])
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
+  auto range = boost::irange(static_cast<size_t>(0),order);
+
   // TODO: replace with std::generate, std::accumulate, or similar
   const auto addit = (iterations+1.) * (iterations/2.);
   auto abserr = 0.0;
diff --git a/README.md b/README.md
index fb756df1d..9360b1d35 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,8 @@ y = yes
 
 i = in-progress, incomplete, or incorrect
 
+f = see footnotes
+
 | Parallelism          | p2p | stencil | transpose | nstream | sparse | dgemm |
 |----------------------|-----|---------|-----------|---------|--------|-------|
 | None                 |  y  |    y    |     y     |    y    |    y   |   y   |

From 4e3010ba88b673593a58230a13c63ef22f543664 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 18 Mar 2018 15:18:05 -0700
Subject: [PATCH 055/245] Boost compute (#316)

add Boost.Compute (just nstream for now)

* valarray does work because of https://github.com/boostorg/compute/issues/758.  i may fix that in Boost.Compute some day.
* add Boost.Compute to Travis (Mac-only)
* split Boost headers since not available in Travis Linux
---
 .gitignore                              |   2 +
 Cxx11/Makefile                          |  17 +-
 Cxx11/nstream-valarray-boost-compute.cc | 184 +++++++++++++++++++
 Cxx11/nstream-vector-boost-compute.cc   | 230 ++++++++++++++++++++++++
 Cxx11/prk_util.h                        |   9 +-
 common/make.defs.cray                   |   2 +-
 common/make.defs.gcc                    |   2 +-
 common/make.defs.intel                  |   2 +-
 common/make.defs.llvm                   |   2 +-
 travis/build-run-prk.sh                 |  49 +++--
 travis/install-boost.sh                 |   6 +-
 11 files changed, 478 insertions(+), 27 deletions(-)
 create mode 100644 Cxx11/nstream-valarray-boost-compute.cc
 create mode 100644 Cxx11/nstream-vector-boost-compute.cc

diff --git a/.gitignore b/.gitignore
index 950d52c2f..96503acd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,6 +129,8 @@ Cxx11/nstream-vector-rangefor
 Cxx11/nstream-vector-stl
 Cxx11/nstream-vector-taskloop
 Cxx11/nstream-vector-tbb
+Cxx11/nstream-valarray-boost-compute
+Cxx11/nstream-vector-boost-compute
 Cxx11/sparse-vector
 Cxx11/stencil-opencl
 Cxx11/stencil-openmp-target
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index f6fed48ca..e0cbe0b6d 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -50,7 +50,8 @@ ifdef OCCADIR
 endif
 OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca
 
-.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl rangefor kokkos raja cuda
+.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \
+	rangefor kokkos raja cuda cublas sycl boost-compute
 
 EXTRA=
 ifeq ($(shell uname -s),Darwin)
@@ -61,7 +62,7 @@ else
   EXTRA += target
 endif
 
-all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa $(EXTRA)
+all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA)
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl
@@ -81,7 +82,7 @@ nstream: nstream-valarray nstream-vector nstream-vector-openmp nstream-openmp-ta
 
 dgemm: dgemm-vector dgemm-cblas
 
-vector: p2p-vector p2p-innerloop-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \
+vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \
 	transpose-vector-async transpose-vector-thread
 
 valarray: transpose-valarray nstream-valarray
@@ -114,7 +115,11 @@ cublas: transpose-cublas nstream-cublas
 
 occa: transpose-occa nstream-occa
 
-p2p-innerloop-vector: p2p-hyperplane-vector-openmp.cc prk_util.h
+boost-compute: nstream-vector-boost-compute
+# busted
+#nstream-valarray-boost-compute
+
+p2p-hyperplane-vector: p2p-hyperplane-vector-openmp.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
 
 transpose-opencl: transpose-opencl.cc transpose.cl prk_util.h prk_opencl.h
@@ -151,6 +156,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 %-rangefor: %-rangefor.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -o $@
 
+%-boost-compute: %-boost-compute.cc prk_util.h
+	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) $(OPENCLFLAGS) -o $@
+
 %-raja: %-raja.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@
 
@@ -204,6 +212,7 @@ clean:
 	-rm -f *-cublas
 	-rm -f *-cblas
 	-rm -f *-occa
+	-rm -f *-boost-compute
 	-rm -f transpose-vector-async transpose-vector-thread
 
 cleancl:
diff --git a/Cxx11/nstream-valarray-boost-compute.cc b/Cxx11/nstream-valarray-boost-compute.cc
new file mode 100644
index 000000000..50c54846f
--- /dev/null
+++ b/Cxx11/nstream-valarray-boost-compute.cc
@@ -0,0 +1,184 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+namespace compute = boost::compute;
+
+using boost::compute::_1;
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/Boost.Compute STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  //compute::compute::device device = compute::compute::system::default_device();
+  auto device = compute::system::default_device();
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+  std::cout << "Boost.Compute device = " << device.name() << std::endl;
+
+  compute::context context(device);
+  compute::command_queue queue(context, device);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<float> h_A;
+  h_A.resize(length,0.0f);
+
+  const float scalar(3);
+
+  {
+    compute::valarray<float> d_A(0.0f, length);
+    compute::valarray<float> d_B(2.0f, length);
+    compute::valarray<float> d_C(2.0f, length);
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      d_A += d_B + scalar * d_C;
+    }
+
+    nstream_time = prk::wtime() - nstream_time;
+
+    compute::copy(std::begin(d_A), std::end(d_A), h_A.begin());
+    queue.finish();
+  }
+  compute::system::finish();
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += std::fabs(h_A[i]);
+  }
+
+  double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(float);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc
new file mode 100644
index 000000000..de95b37a7
--- /dev/null
+++ b/Cxx11/nstream-vector-boost-compute.cc
@@ -0,0 +1,230 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+namespace compute = boost::compute;
+
+using boost::compute::_1;
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/Boost.Compute STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  //compute::compute::device device = compute::compute::system::default_device();
+  auto device = compute::system::default_device();
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+  std::cout << "Boost.Compute device = " << device.name() << std::endl;
+
+  compute::context context(device);
+  compute::command_queue queue(context, device);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<float> h_A;
+  h_A.resize(length);
+
+  const float scalar(3);
+
+  {
+    compute::vector<float> d_A(length, context);
+    compute::vector<float> d_B(length, context);
+    compute::vector<float> d_C(length, context);
+
+    compute::fill(d_A.begin(), d_A.end(), 0, queue);
+    compute::fill(d_B.begin(), d_B.end(), 2, queue);
+    compute::fill(d_C.begin(), d_C.end(), 2, queue);
+    queue.finish();
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+#if STUPID_HACK_IMPLEMENTATION
+      compute::transform(d_B.begin(), d_B.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
+      compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
+      compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
+      compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
+#else
+
+#if LAMBDA_MAKE_TUPLE
+      // Aout and Ain are necessary because A += .. does not work
+      auto Aout = compute::lambda::get<0>(boost::compute::_1);
+      auto Ain  = compute::lambda::get<1>(boost::compute::_1);
+      auto B    = compute::lambda::get<2>(boost::compute::_1);
+      auto C    = compute::lambda::get<3>(boost::compute::_1);
+#endif
+
+      compute::for_each(
+          compute::make_zip_iterator(
+              boost::make_tuple(
+                  d_A.begin(), d_A.begin(), d_B.begin(), d_C.begin()
+              )
+          ),
+          compute::make_zip_iterator(
+              boost::make_tuple(
+                  d_A.end(), d_A.end(), d_B.end(), d_C.end()
+              )
+          ),
+#if LAMBDA_MAKE_TUPLE
+          // += does not work here
+          compute::lambda::make_tuple(
+              Aout = Ain + B + scalar * C
+          ),
+#else
+          // += does not work here
+          compute::lambda::get<0>(_1) = compute::lambda::get<1>(_1)
+                                      + compute::lambda::get<2>(_1)
+                                      + compute::lambda::get<3>(_1) * scalar,
+#endif
+          queue
+      );
+#endif
+
+      queue.finish();
+    }
+
+    nstream_time = prk::wtime() - nstream_time;
+
+    compute::copy(d_A.begin(), d_A.end(), h_A.begin(), queue);
+    queue.finish();
+  }
+  compute::system::finish();
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += std::fabs(h_A[i]);
+  }
+
+  double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(float);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 1c97f9ba9..0109ba684 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -180,8 +180,13 @@ const T prk_reduce(I first, I last, T init) {
 # endif
 #endif
 
-#ifdef USE_BOOST
-# include <boost/range/irange.hpp>
+#if defined(USE_BOOST)
+# include "boost/range/irange.hpp"
+#endif
+
+#if defined(USE_BOOST_COMPUTE)
+# include "boost/compute.hpp"
+# include "boost/compute/container/valarray.hpp"
 #endif
 
 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1800)
diff --git a/common/make.defs.cray b/common/make.defs.cray
index 793bcd656..aee737d77 100644
--- a/common/make.defs.cray
+++ b/common/make.defs.cray
@@ -25,7 +25,7 @@ ORNLACCFLAG=-h acc
 # Parallel STL, Boost, etc.
 #
 # NERSC: "module load boost"
-BOOSTFLAG=-DUSE_BOOST -I$${BOOST_DIR}/include
+BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I$${BOOST_DIR}/include
 #
 # MPI
 #
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 074a1b696..36732883f 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -70,7 +70,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include
 KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 49beeb6fa..7ecd87ead 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -66,7 +66,7 @@ TBBFLAG=-DUSE_TBB -tbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG}
 KOKKOSDIR=/opt/kokkos/intel
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index c67e8b7db..817f9da7d 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -84,7 +84,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include
 KOKKOSDIR=/opt/kokkos/clang
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 3dbe3cfe1..25bd9feb5 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -322,11 +322,12 @@ case "$PRK_TARGET" in
         $PRK_TARGET_PATH/nstream-valarray   10 16777216 32
 
         # C++11 without external parallelism
-        make -C $PRK_TARGET_PATH p2p-vector p2p-innerloop-vector stencil-vector transpose-vector nstream-vector \
+        make -C $PRK_TARGET_PATH p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector \
                                  dgemm-vector sparse-vector
         $PRK_TARGET_PATH/p2p-vector              10 1024 1024
         $PRK_TARGET_PATH/p2p-vector              10 1024 1024 100 100
-        $PRK_TARGET_PATH/p2p-innerloop-vector    10 1024
+        $PRK_TARGET_PATH/p2p-hyperplane-vector   10 1024
+        $PRK_TARGET_PATH/p2p-hyperplane-vector   10 1024 64
         $PRK_TARGET_PATH/stencil-vector          10 1000
         $PRK_TARGET_PATH/transpose-vector        10 1024 32
         $PRK_TARGET_PATH/nstream-vector          10 16777216 32
@@ -352,19 +353,6 @@ case "$PRK_TARGET" in
         $PRK_TARGET_PATH/transpose-vector-thread 10 1024 512 32
         $PRK_TARGET_PATH/transpose-vector-async  10 1024 512 32
 
-        # C++11 with rangefor
-        echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs
-        make -C $PRK_TARGET_PATH rangefor
-        $PRK_TARGET_PATH/stencil-vector-rangefor     10 1000
-        $PRK_TARGET_PATH/transpose-vector-rangefor   10 1024 32
-        $PRK_TARGET_PATH/nstream-vector-rangefor     10 16777216 32
-        #echo "Test stencil code generator"
-        for s in star grid ; do
-            for r in 1 2 3 4 5 ; do
-                $PRK_TARGET_PATH/stencil-vector-rangefor 10 200 20 $s $r
-            done
-        done
-
         # C++11 with OpenMP
         export OMP_NUM_THREADS=2
         case "$CC" in
@@ -448,6 +436,25 @@ case "$PRK_TARGET" in
                 ;;
         esac
 
+        # Boost.Compute found after OpenCL, and only available in Travis with MacOS.
+        if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
+            echo "BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE" >> common/make.defs
+        else
+            echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs
+        fi
+
+        # C++11 with rangefor and Boost.Ranges
+        make -C $PRK_TARGET_PATH rangefor
+        $PRK_TARGET_PATH/stencil-vector-rangefor     10 1000
+        $PRK_TARGET_PATH/transpose-vector-rangefor   10 1024 32
+        $PRK_TARGET_PATH/nstream-vector-rangefor     10 16777216 32
+        #echo "Test stencil code generator"
+        for s in star grid ; do
+            for r in 1 2 3 4 5 ; do
+                $PRK_TARGET_PATH/stencil-vector-rangefor 10 200 20 $s $r
+            done
+        done
+
         # C++11 with TBB
         # Skip Clang because older Clang from Linux chokes on max_align_t (https://travis-ci.org/jeffhammond/PRK/jobs/243395307)
         if [ "${CC}" = "gcc" ] || [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
@@ -529,6 +536,18 @@ case "$PRK_TARGET" in
             cd ..
         fi
 
+        # Boost.Compute moved after OpenCL to reuse those flags...
+
+        # C++11 with Boost.Compute
+        # Only test Mac because:
+        # (1) We only test OpenCL on MacOS in Travis.
+        # (2) Boost.Compute is not available from APT.
+        # If we ever address 1, we need to enable the Boost.Compute install for Linux.
+        if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
+            make -C $PRK_TARGET_PATH nstream-vector-boost-compute
+            $PRK_TARGET_PATH/nstream-vector-boost-compute     10 16777216 32
+        fi
+
         # C++11 with Kokkos, RAJA
         case "$CC" in
             gcc)
diff --git a/travis/install-boost.sh b/travis/install-boost.sh
index 578e2f9fc..4070fc414 100755
--- a/travis/install-boost.sh
+++ b/travis/install-boost.sh
@@ -8,12 +8,14 @@ TRAVIS_ROOT="$1"
 
 case "$os" in
     Darwin)
-        echo "Mac"
         brew update
         brew install boost || brew upgrade boost || true
         ;;
 
     Linux)
-        echo "Linux"
+        # We do not test Boost.Compute on Linux because of OpenCL issues...
+        # Boost.Compute is a header-only library
+        #git clone --depth 1 https://github.com/kylelutz/compute.git ${TRAVIS_ROOT}/compute
+        #git clone --depth 1 https://github.com/boostorg/compute.git ${TRAVIS_ROOT}/compute
         ;;
 esac

From 44da14ae9f4822b0b0cc40b5fc444f1280a24866 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 18 Mar 2018 15:24:30 -0700
Subject: [PATCH 056/245] ignore new name [ci skip]

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 96503acd3..fb475624f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -116,6 +116,7 @@ Cxx11/p2p-innerloop-openmp
 Cxx11/p2p-doacross-vector-openmp
 Cxx11/p2p-innerloop-opencl
 Cxx11/p2p-innerloop-vector
+Cxx11/p2p-hyperplane-vector
 Cxx11/p2p-hyperplane-vector-openmp
 Cxx11/p2p-innerloop-vector-tbb
 Cxx11/nstream-kokkos

From ff02ca0d848d2c1ee7cc277b06d346414cff0c45 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 19 Mar 2018 08:51:51 -0700
Subject: [PATCH 057/245] hoist code, use more compact syntax, etc (#322)

hoist code, use more compact syntax, etc

* fix compiler error with older gcc/clang
```
In file included from stencil-sycl.cc:66:
./stencil_sycl.hpp:20:21: error: call to constructor of
'cl::sycl::id<2>' is ambiguous
    cl::sycl::id<2> dx1({1,0});
```
* move back to c++14 (and allow c++1z warning to persist)
```
clang++-3.9 -std=c++1z -pthread  -DPRKVERSION="2.16" stencil-vector-raja.cc -DUSE_RAJA -I/home/travis/build/ParRes/Kernels/PRK-deps/raja/include -L/home/travis/build/ParRes/Kernels/PRK-deps/raja/lib -lRAJA  -o stencil-vector-raja
In file included from stencil-vector-raja.cc:63:
In file included from ./prk_util.h:217:
In file included from /home/travis/build/ParRes/Kernels/PRK-deps/raja/include/RAJA/RAJA.hpp:45:
In file included from /home/travis/build/ParRes/Kernels/PRK-deps/raja/include/RAJA/util/basic_mempool.hpp:38:
In file included from /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/map:60:
In file included from /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/bits/stl_tree.h:72:
In file included from /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/bits/node_handle.h:39:
/usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:1032:27:
error: use of class template 'optional' requires template arguments
  template <typename _Tp> optional(_Tp) -> optional<_Tp>;
                          ^
/usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:451:11:
note: template is declared here
    class optional
          ^
/usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:1032:40:
error: expected ';' at end of declaration
  template <typename _Tp> optional(_Tp) -> optional<_Tp>;
                                       ^
/usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:1032:41:
error: cannot use arrow operator on a type
  template <typename _Tp> optional(_Tp) -> optional<_Tp>;
                                        ^
```
---
 Cxx11/generate-sycl-stencil.py |  50 ++++--------
 Cxx11/stencil-sycl.cc          |   2 +-
 Cxx11/stencil_sycl.hpp         | 140 ++++++++++++++-------------------
 travis/build-run-prk.sh        |   2 +-
 4 files changed, 79 insertions(+), 115 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index e0c0cae1e..fcb0c49bf 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -5,28 +5,30 @@
 import string
 import os
 
-def codegen(src,pattern,stencil_size,radius,W,model,dim):
-    src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n,\n')
+def codegen(src,pattern,stencil_size,radius,model,dim):
+    src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ')
     if (dim==2):
-        src.write('           cl::sycl::buffer<double, 2> & d_in,\n')
-        src.write('           cl::sycl::buffer<double, 2> & d_out) {\n')
+        src.write('cl::sycl::buffer<double, 2> & d_in, ')
+        src.write('cl::sycl::buffer<double, 2> & d_out)\n')
     else:
-        src.write('           cl::sycl::buffer<double> & d_in,\n')
-        src.write('           cl::sycl::buffer<double> & d_out) {\n')
+        src.write('cl::sycl::buffer<double> & d_in, ')
+        src.write('cl::sycl::buffer<double> & d_out)\n')
+    src.write('{\n')
     src.write('  q.submit([&](cl::sycl::handler& h) {\n')
     src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);\n')
     src.write('    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);\n')
     if (dim==2):
-        src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
-        src.write('                                [=] (cl::sycl::item<2> it) {\n')
-        src.write('        cl::sycl::id<2> xy = it.get_id();\n')
         for r in range(1,radius+1):
-            src.write('        cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
-            src.write('        cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
+            src.write('    cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
+            src.write('    cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
+    src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(')
+    src.write('{n-'+str(2*radius)+',n-'+str(2*radius)+'}, ')
+    src.write('{'+str(radius)+','+str(radius)+'}, ')
+    src.write('[=] (auto it) {\n')
+    if (dim==2):
+        src.write('        cl::sycl::id<2> xy = it.get_id();\n')
         src.write('        out[xy] += ')
     else:
-        src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n')
-        src.write('                                [=] (cl::sycl::item<2> it) {\n')
         # 1D indexing the slow way
         #src.write('        auto i = it[0];\n')
         #src.write('        auto j = it[1];\n')
@@ -78,30 +80,12 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim):
     src.write('}\n\n')
 
 def instance(src,model,pattern,r):
-
-    W = [[0.0e0 for x in range(2*r+1)] for x in range(2*r+1)]
     if pattern == 'star':
         stencil_size = 4*r+1
-        for i in range(1,r+1):
-            W[r][r+i] = +1./(2*i*r)
-            W[r+i][r] = +1./(2*i*r)
-            W[r][r-i] = -1./(2*i*r)
-            W[r-i][r] = -1./(2*i*r)
-
     else:
         stencil_size = (2*r+1)**2
-        for j in range(1,r+1):
-            for i in range(-j+1,j):
-                W[r+i][r+j] = +1./(4*j*(2*j-1)*r)
-                W[r+i][r-j] = -1./(4*j*(2*j-1)*r)
-                W[r+j][r+i] = +1./(4*j*(2*j-1)*r)
-                W[r-j][r+i] = -1./(4*j*(2*j-1)*r)
-
-            W[r+j][r+j]    = +1./(4*j*r)
-            W[r-j][r-j]    = -1./(4*j*r)
-
-    codegen(src,pattern,stencil_size,r,W,model,1)
-    codegen(src,pattern,stencil_size,r,W,model,2)
+    codegen(src,pattern,stencil_size,r,model,1)
+    codegen(src,pattern,stencil_size,r,model,2)
 
 def main():
     for model in ['sycl']:
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index c4bfa6ff8..7aceb02c0 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -60,7 +60,7 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#define USE_2D_INDEXING 0
+#define USE_2D_INDEXING 1
 
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 18391ce41..261128675 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -1,11 +1,9 @@
-void star1(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double> & d_in,
-           cl::sycl::buffer<double> & d_out) {
+void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star1_1d>(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1},
-                                [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star1_1d>({n-2,n-2}, {1,1}, [=] (auto it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5
                               +in[it[0]*n+(it[1]-1)] * -0.5
                               +in[(it[0]+1)*n+it[1]] * 0.5
@@ -14,17 +12,15 @@ void star1(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star1(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> & d_in,
-           cl::sycl::buffer<double, 2> & d_out) {
+void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star1_2d>(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1},
-                                [=] (cl::sycl::item<2> it) {
+    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+    h.parallel_for<class star1_2d>({n-2,n-2}, {1,1}, [=] (auto it) {
         cl::sycl::id<2> xy = it.get_id();
-        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
         out[xy] += +in[xy+dx1] * 0.5
                    +in[xy-dx1] * -0.5
                    +in[xy+dy1] * 0.5
@@ -33,14 +29,12 @@ void star1(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star2(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double> & d_in,
-           cl::sycl::buffer<double> & d_out) {
+void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star2_1d>(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2},
-                                [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star2_1d>({n-4,n-4}, {2,2}, [=] (auto it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25
                               +in[it[0]*n+(it[1]-1)] * -0.25
                               +in[(it[0]+1)*n+it[1]] * 0.25
@@ -53,19 +47,17 @@ void star2(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star2(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> & d_in,
-           cl::sycl::buffer<double, 2> & d_out) {
+void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star2_2d>(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2},
-                                [=] (cl::sycl::item<2> it) {
+    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+    h.parallel_for<class star2_2d>({n-4,n-4}, {2,2}, [=] (auto it) {
         cl::sycl::id<2> xy = it.get_id();
-        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
         out[xy] += +in[xy+dx1] * 0.25
                    +in[xy-dx1] * -0.25
                    +in[xy+dy1] * 0.25
@@ -78,14 +70,12 @@ void star2(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star3(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double> & d_in,
-           cl::sycl::buffer<double> & d_out) {
+void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star3_1d>(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3},
-                                [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star3_1d>({n-6,n-6}, {3,3}, [=] (auto it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.166666666667
                               +in[it[0]*n+(it[1]-1)] * -0.166666666667
                               +in[(it[0]+1)*n+it[1]] * 0.166666666667
@@ -102,21 +92,19 @@ void star3(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star3(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> & d_in,
-           cl::sycl::buffer<double, 2> & d_out) {
+void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star3_2d>(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3},
-                                [=] (cl::sycl::item<2> it) {
+    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+    cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
+    cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
+    h.parallel_for<class star3_2d>({n-6,n-6}, {3,3}, [=] (auto it) {
         cl::sycl::id<2> xy = it.get_id();
-        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-        cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
-        cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
         out[xy] += +in[xy+dx1] * 0.166666666667
                    +in[xy-dx1] * -0.166666666667
                    +in[xy+dy1] * 0.166666666667
@@ -133,14 +121,12 @@ void star3(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star4(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double> & d_in,
-           cl::sycl::buffer<double> & d_out) {
+void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star4_1d>(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4},
-                                [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star4_1d>({n-8,n-8}, {4,4}, [=] (auto it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125
                               +in[it[0]*n+(it[1]-1)] * -0.125
                               +in[(it[0]+1)*n+it[1]] * 0.125
@@ -161,23 +147,21 @@ void star4(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star4(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> & d_in,
-           cl::sycl::buffer<double, 2> & d_out) {
+void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star4_2d>(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4},
-                                [=] (cl::sycl::item<2> it) {
+    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+    cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
+    cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
+    cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
+    cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
+    h.parallel_for<class star4_2d>({n-8,n-8}, {4,4}, [=] (auto it) {
         cl::sycl::id<2> xy = it.get_id();
-        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-        cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
-        cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-        cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
-        cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
         out[xy] += +in[xy+dx1] * 0.125
                    +in[xy-dx1] * -0.125
                    +in[xy+dy1] * 0.125
@@ -198,14 +182,12 @@ void star4(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star5(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double> & d_in,
-           cl::sycl::buffer<double> & d_out) {
+void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star5_1d>(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5},
-                                [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star5_1d>({n-10,n-10}, {5,5}, [=] (auto it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1
                               +in[it[0]*n+(it[1]-1)] * -0.1
                               +in[(it[0]+1)*n+it[1]] * 0.1
@@ -230,25 +212,23 @@ void star5(cl::sycl::queue & q, const size_t n,
   });
 }
 
-void star5(cl::sycl::queue & q, const size_t n,
-           cl::sycl::buffer<double, 2> & d_in,
-           cl::sycl::buffer<double, 2> & d_out) {
+void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+{
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star5_2d>(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5},
-                                [=] (cl::sycl::item<2> it) {
+    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
+    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
+    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
+    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
+    cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
+    cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
+    cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
+    cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
+    cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
+    cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
+    h.parallel_for<class star5_2d>({n-10,n-10}, {5,5}, [=] (auto it) {
         cl::sycl::id<2> xy = it.get_id();
-        cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-        cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-        cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-        cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-        cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
-        cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-        cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
-        cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
-        cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
-        cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
         out[xy] += +in[xy+dx1] * 0.1
                    +in[xy-dx1] * -0.1
                    +in[xy+dy1] * 0.1
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 25bd9feb5..7ae2e14b4 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -314,7 +314,7 @@ case "$PRK_TARGET" in
         ${PRK_CXX} -v
         # Need to increment this for PSTL
         # The pthread flag is supported by GCC and Clang at least
-        echo "CXX=${PRK_CXX} -std=c++11 -pthread" >> common/make.defs
+        echo "CXX=${PRK_CXX} -std=c++14 -pthread" >> common/make.defs
 
         # C++11 without external parallelism
         make -C $PRK_TARGET_PATH transpose-valarray nstream-valarray

From 451a1ebdadde406e64d097df7991eb3419d73de3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 19 Mar 2018 08:52:42 -0700
Subject: [PATCH 058/245] cleanup Boost compute nstream (#323)

- remove stupid hack implementation
- remove the pedantic version that doesn't use make_tuple
---
 Cxx11/nstream-vector-boost-compute.cc | 53 +++++++--------------------
 1 file changed, 14 insertions(+), 39 deletions(-)

diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc
index de95b37a7..fec24fbbf 100644
--- a/Cxx11/nstream-vector-boost-compute.cc
+++ b/Cxx11/nstream-vector-boost-compute.cc
@@ -62,6 +62,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#define LAMBDA_MAKE_TUPLE 1
+
 #include "prk_util.h"
 
 namespace compute = boost::compute;
@@ -104,7 +106,6 @@ int main(int argc, char * argv[])
     return 1;
   }
 
-  //compute::compute::device device = compute::compute::system::default_device();
   auto device = compute::system::default_device();
 
   std::cout << "Number of iterations = " << iterations << std::endl;
@@ -112,9 +113,6 @@ int main(int argc, char * argv[])
   std::cout << "Offset               = " << offset << std::endl;
   std::cout << "Boost.Compute device = " << device.name() << std::endl;
 
-  compute::context context(device);
-  compute::command_queue queue(context, device);
-
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
@@ -126,6 +124,8 @@ int main(int argc, char * argv[])
 
   const float scalar(3);
 
+  compute::context context(device);
+  compute::command_queue queue(context, device);
   {
     compute::vector<float> d_A(length, context);
     compute::vector<float> d_B(length, context);
@@ -140,47 +140,22 @@ int main(int argc, char * argv[])
 
       if (iter==1) nstream_time = prk::wtime();
 
-#if STUPID_HACK_IMPLEMENTATION
-      compute::transform(d_B.begin(), d_B.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
-      compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
-      compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
-      compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus<float>(), queue);
-#else
-
-#if LAMBDA_MAKE_TUPLE
       // Aout and Ain are necessary because A += .. does not work
       auto Aout = compute::lambda::get<0>(boost::compute::_1);
       auto Ain  = compute::lambda::get<1>(boost::compute::_1);
       auto B    = compute::lambda::get<2>(boost::compute::_1);
       auto C    = compute::lambda::get<3>(boost::compute::_1);
-#endif
-
-      compute::for_each(
-          compute::make_zip_iterator(
-              boost::make_tuple(
-                  d_A.begin(), d_A.begin(), d_B.begin(), d_C.begin()
-              )
-          ),
-          compute::make_zip_iterator(
-              boost::make_tuple(
-                  d_A.end(), d_A.end(), d_B.end(), d_C.end()
-              )
-          ),
-#if LAMBDA_MAKE_TUPLE
-          // += does not work here
-          compute::lambda::make_tuple(
-              Aout = Ain + B + scalar * C
-          ),
-#else
-          // += does not work here
-          compute::lambda::get<0>(_1) = compute::lambda::get<1>(_1)
-                                      + compute::lambda::get<2>(_1)
-                                      + compute::lambda::get<3>(_1) * scalar,
-#endif
-          queue
-      );
-#endif
 
+      auto begin = compute::make_zip_iterator( boost::make_tuple( d_A.begin(), d_A.begin(), d_B.begin(), d_C.begin()));
+      auto end   = compute::make_zip_iterator( boost::make_tuple( d_A.end(),   d_A.end(),   d_B.end(),   d_C.end()));
+
+      compute::for_each(begin, end,
+                        compute::lambda::make_tuple
+                        (
+                            Aout = Ain + B + scalar * C
+                        ),
+                        queue
+                       );
       queue.finish();
     }
 

From 6ef0faaae730538254d00056b68a2bc31bdf2ab4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 19 Mar 2018 10:28:55 -0700
Subject: [PATCH 059/245] Update README.md

[ci skip]
---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9360b1d35..871ea9272 100644
--- a/README.md
+++ b/README.md
@@ -90,14 +90,17 @@ f = see footnotes
 | OpenMP target        |  y  |    y    |     y     |    y    |        |       |
 | OpenCL 1.x           |  i  |    y    |     y     |    y    |        |       |
 | SYCL                 |     |    y    |     y     |    y    |        |       |
+| Boost.Compute        |     |         |           |    y    |        |       |
 | Parallel STL         |  y  |    y    |     y     |    y    |        |       |
 | TBB                  |  i  |    y    |     y     |    y    |        |       |
 | Kokkos               |  y  |    y    |     y     |    y    |        |       |
 | RAJA                 |  y  |    y    |     y     |    y    |        |       |
-| CUDA                 |     |         |     y     |    y    |        |       |
+| CUDA                 |  i  |    y    |     y     |    y    |        |       |
 | CUBLAS               |     |         |     y     |    y    |        |       |
 | CBLAS                |     |         |           |         |        |   y   |
 
+* [SYCL](http://sycl.tech/)
+* [Boost.Compute](http://boostorg.github.io/compute/)
 * [TBB](https://www.threadingbuildingblocks.org/)
 * [Kokkos](https://github.com/kokkos/kokkos)
 * [RAJA](https://github.com/LLNL/RAJA)

From 0c3fceb0dbe22b5e31358497b9a0e214b498d9ce Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 20 Mar 2018 09:13:13 -0700
Subject: [PATCH 060/245] workaround CUDA compiler breaking intrinsics headers

CUDA compiler requires disabling one of the following:
1) all x86 intrinsics
2) all Intel + AMD intrinsics
3) MWAIT + all AVX-512 intrinsics

it is almost as if this sort of idiotic behavior is intentional...

[ci skip]
---
 common/make.defs.cuda | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 07838070e..48b85710d 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -46,7 +46,27 @@ NVCC=/opt/llvm/cocl/bin/cocl
 #NVCC=nvcc --compiler-bindir=<path to older GCC> --gpu-architecture=sm_61
 CUDAFLAGS=-g -O3 -std=c++11
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
+# heavy hammer:
+#CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
+# big hammers:
+#CUDAFLAGS+=-D_IMMINTRIN_H_INCLUDED
+#CUDAFLAGS+=-D_FMA4INTRIN_H_INCLUDED
+#CUDAFLAGS+=-D_XOPMMINTRIN_H_INCLUDED
+# many tiny hammers:
 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512FINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VLINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512BWINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512DQINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VLBWINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VBMIVLINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VBMIINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VLDQINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512CDINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512PFINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
 #
 # MPI
 #

From 8e17a9a3b28d4752dfe35079a1539c5576af7922 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 19 Mar 2018 12:55:30 -0700
Subject: [PATCH 061/245] add p2p hyperplane in TBB

---
 Cxx11/Makefile                     |   5 +-
 Cxx11/p2p-hyperplane-vector-tbb.cc | 210 +++++++++++++++++++++++++++++
 Cxx11/p2p-innerloop-vector-tbb.cc  |   5 +-
 travis/build-run-prk.sh            |  10 +-
 4 files changed, 222 insertions(+), 8 deletions(-)
 create mode 100644 Cxx11/p2p-hyperplane-vector-tbb.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index e0cbe0b6d..76e5d0d73 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -65,7 +65,7 @@ endif
 all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA)
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
-     p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl
+     p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb
 
 stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
@@ -97,7 +97,8 @@ opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
 sycl: stencil-sycl transpose-sycl nstream-sycl
 
-tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
+tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \
+     p2p-hyperplane-vector-tbb
 
 stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl
 
diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc
new file mode 100644
index 000000000..250356319
--- /dev/null
+++ b/Cxx11/p2p-hyperplane-vector-tbb.cc
@@ -0,0 +1,210 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+inline void sweep_tile_sequential(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, std::vector<double> & grid)
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+#if 0
+inline void sweep_tile_hyperplane(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, std::vector<double> & grid)
+{
+  for (auto i=2; i<=2*n-2; i++) {
+    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+      const auto x = i-j+1;
+      const auto y = j-1;
+      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+    }
+  }
+}
+#endif
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/TBB HYPERPLANE pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int n, nc, nb;
+  try {
+      if (argc < 3) {
+        throw " <# iterations> <array dimension> [<chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(n)*static_cast<size_t>(n) > static_cast<size_t>(INT_MAX)) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      nc = (argc > 3) ? std::atoi(argv[3]) : 1;
+      nc = std::max(1,nc);
+      nc = std::min(n,nc);
+
+      // number of grid blocks
+      nb = (n-1)/nc;
+      if ((n-1)%nc) nb++;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  const char* envvar = std::getenv("TBB_NUM_THREADS");
+  int num_threads = (envvar!=NULL) ? std::atoi(envvar) : tbb::task_scheduler_init::default_num_threads();
+  tbb::task_scheduler_init init(num_threads);
+
+  std::cout << "Number of threads    = " << num_threads << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << nc << std::endl;
+  std::cout << "TBB partitioner: " << typeid(tbb_partitioner).name() << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto pipeline_time = 0.0; // silence compiler warning
+
+  std::vector<double> grid(n*n,0.0);
+
+  // set boundary values (bottom and left side of grid)
+  for (auto j=0; j<n; j++) {
+    grid[0*n+j] = static_cast<double>(j);
+    grid[j*n+0] = static_cast<double>(j);
+  }
+
+  for (auto iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) pipeline_time = prk::wtime();
+
+    if (nc==1) {
+      for (auto i=2; i<=2*n-2; i++) {
+        //OMP_FOR_SIMD
+        //for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+        tbb::parallel_for( std::max(2,i-n+2), std::min(i,n)+1, [=,&grid](int j) {
+          const auto x = i-j+1;
+          const auto y = j-1;
+          grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+        });
+      }
+    } else {
+      for (int i=2; i<=2*(nb+1)-2; i++) {
+        //OMP_FOR()
+        //for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) {
+        tbb::parallel_for( std::max(2,i-(nb+1)+2), std::min(i,nb+1)+1, [=,&grid](int j) {
+          const int ib = nc*(i-j)+1;
+          const int jb = nc*(j-2)+1;
+          sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+        });
+      }
+    }
+    grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+  }
+
+  pipeline_time = prk::wtime() - pipeline_time;
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(2.*n-2.));
+  if ( (std::fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << grid[(n-1)*n+(n-1)]
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc
index 788226f71..2bff51f15 100644
--- a/Cxx11/p2p-innerloop-vector-tbb.cc
+++ b/Cxx11/p2p-innerloop-vector-tbb.cc
@@ -72,9 +72,8 @@ int main(int argc, char* argv[])
 
   int iterations;
   int n;
-  int mc, nc;
   try {
-      if (argc < 3){
+      if (argc < 3) {
         throw " <# iterations> <array dimension>";
       }
 
@@ -121,7 +120,9 @@ int main(int argc, char* argv[])
   }
 
   for (auto iter = 0; iter<=iterations; iter++){
+
     if (iter == 1) pipeline_time = prk::wtime();
+
     for (auto i=2; i<=2*n-2; i++) {
       tbb::parallel_for( std::max(2,i-n+2), std::min(i,n)+1, [=,&grid](int j) {
                const auto x = i-j+2-1;
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 7ae2e14b4..05a5b49a1 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -473,10 +473,12 @@ case "$PRK_TARGET" in
                     ;;
             esac
             make -C $PRK_TARGET_PATH stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
-            #$PRK_TARGET_PATH/p2p-vector-tbb     10 1024 1024 64 64
-            $PRK_TARGET_PATH/stencil-vector-tbb     10 1000
-            $PRK_TARGET_PATH/transpose-vector-tbb   10 1024 32
-            $PRK_TARGET_PATH/nstream-vector-tbb     10 16777216 32
+            $PRK_TARGET_PATH/p2p-innerloop-vector-tbb     10 1024
+            $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 1
+            $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 32
+            $PRK_TARGET_PATH/stencil-vector-tbb           10 1000
+            $PRK_TARGET_PATH/transpose-vector-tbb         10 1024 32
+            $PRK_TARGET_PATH/nstream-vector-tbb           10 16777216 32
             #echo "Test stencil code generator"
             for s in star grid ; do
                 for r in 1 2 3 4 5 ; do

From a946d934af89a2721746899c1ddd40cf7592cdde Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 19 Mar 2018 15:14:39 -0700
Subject: [PATCH 062/245] add STL/PSTL p2p hyperplane ala TBB (and OpenMP)

---
 .gitignore                          |   3 +
 Cxx11/p2p-hyperplane-vector-pstl.cc | 225 ++++++++++++++++++++++++++++
 Cxx11/p2p-hyperplane-vector-tbb.cc  |   1 +
 travis/build-run-prk.sh             |  22 +--
 4 files changed, 242 insertions(+), 9 deletions(-)
 create mode 100644 Cxx11/p2p-hyperplane-vector-pstl.cc

diff --git a/.gitignore b/.gitignore
index fb475624f..44d361e21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -210,3 +210,6 @@ FORTRAN/transpose-tasks-openmp
 RUST/p2p
 RUST/stencil
 RUST/transpose
+p2p-hyperplane-vector-stl
+p2p-hyperplane-vector-pstl
+p2p-hyperplane-vector-tbb
diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
new file mode 100644
index 000000000..81b58d50c
--- /dev/null
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -0,0 +1,225 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+inline void sweep_tile_sequential(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, std::vector<double> & grid)
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+#if 0
+inline void sweep_tile_hyperplane(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, std::vector<double> & grid)
+{
+  for (auto i=2; i<=2*n-2; i++) {
+    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+      const auto x = i-j+1;
+      const auto y = j-1;
+      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+    }
+  }
+}
+#endif
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+#if defined(USE_PSTL)
+  std::cout << "C++17 PSTL HYPERPLANE pipeline execution on 2D grid" << std::endl;
+#else
+  std::cout << "C++11 STL HYPERPLANE pipeline execution on 2D grid" << std::endl;
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int n, nc, nb;
+  try {
+      if (argc < 3) {
+        throw " <# iterations> <array dimension> [<chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(n)*static_cast<size_t>(n) > static_cast<size_t>(INT_MAX)) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      nc = (argc > 3) ? std::atoi(argv[3]) : 1;
+      nc = std::max(1,nc);
+      nc = std::min(n,nc);
+
+      // number of grid blocks
+      nb = (n-1)/nc;
+      if ((n-1)%nc) nb++;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << nc << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto pipeline_time = 0.0; // silence compiler warning
+
+  std::vector<double> grid(n*n,0.0);
+
+  // set boundary values (bottom and left side of grid)
+  for (auto j=0; j<n; j++) {
+    grid[0*n+j] = static_cast<double>(j);
+    grid[j*n+0] = static_cast<double>(j);
+  }
+
+  for (auto iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) pipeline_time = prk::wtime();
+
+    if (nc==1) {
+      for (auto i=2; i<=2*n-2; i++) {
+        const auto begin = std::max(2,i-n+2);
+        const auto end   = std::min(i,n)+1;
+        auto range = boost::irange(begin,end);
+#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+        std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) {
+#elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
+                        && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
+        __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) {
+#else
+        std::for_each( std::begin(range), std::end(range), [&] (auto j) {
+#endif
+          const auto x = i-j+1;
+          const auto y = j-1;
+          grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+        });
+      }
+    } else {
+      for (int i=2; i<=2*(nb+1)-2; i++) {
+        const auto begin = std::max(2,i-(nb+1)+2);
+        const auto end   = std::min(i,nb+1)+1;
+        auto range = boost::irange(begin,end);
+#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+        std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) {
+#elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
+                        && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
+        __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) {
+#else
+        std::for_each( std::begin(range), std::end(range), [&] (auto j) {
+#endif
+          const int ib = nc*(i-j)+1;
+          const int jb = nc*(j-2)+1;
+          sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+        });
+      }
+    }
+    grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+  }
+
+  pipeline_time = prk::wtime() - pipeline_time;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(2.*n-2.));
+  if ( (std::fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << grid[(n-1)*n+(n-1)]
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc
index 250356319..6c4ad9aac 100644
--- a/Cxx11/p2p-hyperplane-vector-tbb.cc
+++ b/Cxx11/p2p-hyperplane-vector-tbb.cc
@@ -184,6 +184,7 @@ int main(int argc, char* argv[])
   }
 
   pipeline_time = prk::wtime() - pipeline_time;
+
   //////////////////////////////////////////////////////////////////////
   // Analyze and output results.
   //////////////////////////////////////////////////////////////////////
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 05a5b49a1..a88fade88 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -472,7 +472,7 @@ case "$PRK_TARGET" in
                     export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH}
                     ;;
             esac
-            make -C $PRK_TARGET_PATH stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
+            make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
             $PRK_TARGET_PATH/p2p-innerloop-vector-tbb     10 1024
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 1
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 32
@@ -488,10 +488,12 @@ case "$PRK_TARGET" in
         fi
 
         # C++11 with STL
-        make -C $PRK_TARGET_PATH stencil-vector-stl transpose-vector-stl nstream-vector-stl
-        $PRK_TARGET_PATH/stencil-vector-stl     10 1000
-        $PRK_TARGET_PATH/transpose-vector-stl   10 1024 32
-        $PRK_TARGET_PATH/nstream-vector-stl     10 16777216 32
+        make -C $PRK_TARGET_PATH p2p-hyperplane-vector-stl stencil-vector-stl transpose-vector-stl nstream-vector-stl
+        $PRK_TARGET_PATH/p2p-hyperplane-vector-stl    10 1024 1
+        $PRK_TARGET_PATH/p2p-hyperplane-vector-stl    10 1024 32
+        $PRK_TARGET_PATH/stencil-vector-stl           10 1000
+        $PRK_TARGET_PATH/transpose-vector-stl         10 1024 32
+        $PRK_TARGET_PATH/nstream-vector-stl           10 16777216 32
         #echo "Test stencil code generator"
         for s in star grid ; do
             for r in 1 2 3 4 5 ; do
@@ -508,10 +510,12 @@ case "$PRK_TARGET" in
             else
                 echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I$TRAVIS_ROOT/pstl/include" >> common/make.defs
             fi
-            make -C $PRK_TARGET_PATH stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl
-            $PRK_TARGET_PATH/stencil-vector-pstl     10 1000
-            $PRK_TARGET_PATH/transpose-vector-pstl   10 1024 32
-            $PRK_TARGET_PATH/nstream-vector-pstl     10 16777216 32
+            make -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl
+            $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl    10 1024 1
+            $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl    10 1024 32
+            $PRK_TARGET_PATH/stencil-vector-pstl           10 1000
+            $PRK_TARGET_PATH/transpose-vector-pstl         10 1024 32
+            $PRK_TARGET_PATH/nstream-vector-pstl           10 16777216 32
             #echo "Test stencil code generator"
             for s in star grid ; do
                 for r in 1 2 3 4 5 ; do

From 1b94d4c10caa8dba77dbc0462455c7fb5fe4308d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 19 Mar 2018 16:40:43 -0700
Subject: [PATCH 063/245] add p2p innerloop (unblocked hyperplane) for SYCL

the performance of this is terrible with triSYCL.  like 100x100 times
out.  may be a race condition leading to deadlock.  need to debug.
---
 Cxx11/Makefile               |   5 +-
 Cxx11/p2p-hyperplane-sycl.cc | 221 +++++++++++++++++++++++++++++++++++
 travis/build-run-prk.sh      |   9 +-
 3 files changed, 229 insertions(+), 6 deletions(-)
 create mode 100644 Cxx11/p2p-hyperplane-sycl.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 76e5d0d73..190b44dbb 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -65,7 +65,8 @@ endif
 all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA)
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
-     p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb
+     p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
+     p2p-hyperplane-sycl
 
 stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
@@ -95,7 +96,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
-sycl: stencil-sycl transpose-sycl nstream-sycl
+sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \
      p2p-hyperplane-vector-tbb
diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc
new file mode 100644
index 000000000..58f7e7e2d
--- /dev/null
+++ b/Cxx11/p2p-hyperplane-sycl.cc
@@ -0,0 +1,221 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+inline void sweep_tile_sequential(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, std::vector<double> & grid)
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+#if 0
+inline void sweep_tile_hyperplane(int startm, int endm,
+                                  int startn, int endn,
+                                  int n, std::vector<double> & grid)
+{
+  for (auto i=2; i<=2*n-2; i++) {
+    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+      const auto x = i-j+1;
+      const auto y = j-1;
+      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+    }
+  }
+}
+#endif
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL HYPERPLANE pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int n;
+#if 0
+  int nc, nb;
+#endif
+  try {
+      if (argc < 3) {
+        throw " <# iterations> <array dimension> [<chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(n)*static_cast<size_t>(n) > static_cast<size_t>(INT_MAX)) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+#if 0
+      // grid chunk dimensions
+      nc = (argc > 3) ? std::atoi(argv[3]) : 1;
+      nc = std::max(1,nc);
+      nc = std::min(n,nc);
+
+      // number of grid blocks
+      nb = (n-1)/nc;
+      if ((n-1)%nc) nb++;
+#endif
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+#if 0
+  std::cout << "Grid chunk sizes     = " << nc << std::endl;
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto pipeline_time = 0.0; // silence compiler warning
+
+  std::vector<double> h_grid(n*n,0.0);
+  for (int j=0; j<n; j++) {
+    h_grid[0*n+j] = static_cast<double>(j);
+    h_grid[j*n+0] = static_cast<double>(j);
+  }
+
+  cl::sycl::queue q;
+  {
+    cl::sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) pipeline_time = prk::wtime();
+
+      for (int i=2; i<=2*n-2; i++) {
+
+        cl::sycl::id<1> I{unsigned(i)};
+        cl::sycl::id<1> One{1};
+
+        q.submit([&](cl::sycl::handler& h) {
+
+          auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
+
+          unsigned begin = std::max(2,i-n+2);
+          unsigned end   = std::min(i,n)+1;
+          unsigned range = end-begin;
+
+          h.parallel_for<class p2p>(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> J) {
+            cl::sycl::id<1> N{unsigned(n)};
+            cl::sycl::id<1> X{I-J+One};
+            cl::sycl::id<1> Y{J-One};
+            cl::sycl::id<1> Xold{X-One}; // x-1
+            cl::sycl::id<1> Yold{Y-One}; // y-1
+            cl::sycl::id<1> index0{X*N+Y};
+            cl::sycl::id<1> index1{Xold*N+Y};
+            cl::sycl::id<1> index2{X*N+Yold};
+            cl::sycl::id<1> index3{Xold*N+Yold};
+            grid[index0] = grid[index1] + grid[index2] - grid[index3];
+          });
+        });
+        q.wait();
+      }
+      h_grid[0*n+0] = -h_grid[(n-1)*n+(n-1)];
+    }
+    pipeline_time = prk::wtime() - pipeline_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(2.*n-2.));
+  if ( (std::fabs(h_grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << h_grid[(n-1)*n+(n-1)]
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index a88fade88..5812e949f 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -623,10 +623,11 @@ case "$PRK_TARGET" in
                 echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++14" >> common/make.defs
             fi
             echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs
-            make -C $PRK_TARGET_PATH stencil-sycl transpose-sycl nstream-sycl
-            $PRK_TARGET_PATH/stencil-sycl     10 1000
-            $PRK_TARGET_PATH/transpose-sycl   10 1024 32
-            $PRK_TARGET_PATH/nstream-sycl     10 16777216 32
+            make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
+            $PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o
+            $PRK_TARGET_PATH/stencil-sycl        10 1000
+            $PRK_TARGET_PATH/transpose-sycl      10 1024 32
+            $PRK_TARGET_PATH/nstream-sycl        10 16777216 32
             #echo "Test stencil code generator"
             for s in star ; do # grid ; do # grid not supported yet
                 for r in 1 2 3 4 5 ; do

From 50b527ebca6189cfff0743b04685c0161c5ae082 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 20 Mar 2018 05:32:42 -0700
Subject: [PATCH 064/245] disable p2p-hyperplane-sycl

---
 travis/build-run-prk.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 5812e949f..7e78338fd 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -624,7 +624,7 @@ case "$PRK_TARGET" in
             fi
             echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs
             make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
-            $PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o
+            #$PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o
             $PRK_TARGET_PATH/stencil-sycl        10 1000
             $PRK_TARGET_PATH/transpose-sycl      10 1024 32
             $PRK_TARGET_PATH/nstream-sycl        10 16777216 32

From 5bb8be6915357c5af9a745e84c7391976f4632d3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 20 Mar 2018 05:52:04 -0700
Subject: [PATCH 065/245] fix errors in stencil code generator

---
 Cxx11/generate-sycl-stencil.py |   6 +-
 Cxx11/stencil_sycl.hpp         | 100 ++++++++++++++++-----------------
 2 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index fcb0c49bf..1c71ff03c 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -22,9 +22,9 @@ def codegen(src,pattern,stencil_size,radius,model,dim):
             src.write('    cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
             src.write('    cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
     src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(')
-    src.write('{n-'+str(2*radius)+',n-'+str(2*radius)+'}, ')
-    src.write('{'+str(radius)+','+str(radius)+'}, ')
-    src.write('[=] (auto it) {\n')
+    src.write('cl::sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ')
+    src.write('cl::sycl::id<2> {'+str(radius)+','+str(radius)+'}, ')
+    src.write('[=] (cl::sycl::item<2> it) {\n')
     if (dim==2):
         src.write('        cl::sycl::id<2> xy = it.get_id();\n')
         src.write('        out[xy] += ')
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 261128675..6fbf8d9f7 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -3,7 +3,7 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star1_1d>({n-2,n-2}, {1,1}, [=] (auto it) {
+    h.parallel_for<class star1_1d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5
                               +in[it[0]*n+(it[1]-1)] * -0.5
                               +in[(it[0]+1)*n+it[1]] * 0.5
@@ -19,7 +19,7 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    h.parallel_for<class star1_2d>({n-2,n-2}, {1,1}, [=] (auto it) {
+    h.parallel_for<class star1_2d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * 0.5
                    +in[xy-dx1] * -0.5
@@ -34,7 +34,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star2_1d>({n-4,n-4}, {2,2}, [=] (auto it) {
+    h.parallel_for<class star2_1d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25
                               +in[it[0]*n+(it[1]-1)] * -0.25
                               +in[(it[0]+1)*n+it[1]] * 0.25
@@ -56,7 +56,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
     cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-    h.parallel_for<class star2_2d>({n-4,n-4}, {2,2}, [=] (auto it) {
+    h.parallel_for<class star2_2d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * 0.25
                    +in[xy-dx1] * -0.25
@@ -75,19 +75,19 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star3_1d>({n-6,n-6}, {3,3}, [=] (auto it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.166666666667
-                              +in[it[0]*n+(it[1]-1)] * -0.166666666667
-                              +in[(it[0]+1)*n+it[1]] * 0.166666666667
-                              +in[(it[0]-1)*n+it[1]] * -0.166666666667
-                              +in[it[0]*n+(it[1]+2)] * 0.0833333333333
-                              +in[it[0]*n+(it[1]-2)] * -0.0833333333333
-                              +in[(it[0]+2)*n+it[1]] * 0.0833333333333
-                              +in[(it[0]-2)*n+it[1]] * -0.0833333333333
-                              +in[it[0]*n+(it[1]+3)] * 0.0555555555556
-                              +in[it[0]*n+(it[1]-3)] * -0.0555555555556
-                              +in[(it[0]+3)*n+it[1]] * 0.0555555555556
-                              +in[(it[0]-3)*n+it[1]] * -0.0555555555556;
+    h.parallel_for<class star3_1d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.16666666666666666
+                              +in[it[0]*n+(it[1]-1)] * -0.16666666666666666
+                              +in[(it[0]+1)*n+it[1]] * 0.16666666666666666
+                              +in[(it[0]-1)*n+it[1]] * -0.16666666666666666
+                              +in[it[0]*n+(it[1]+2)] * 0.08333333333333333
+                              +in[it[0]*n+(it[1]-2)] * -0.08333333333333333
+                              +in[(it[0]+2)*n+it[1]] * 0.08333333333333333
+                              +in[(it[0]-2)*n+it[1]] * -0.08333333333333333
+                              +in[it[0]*n+(it[1]+3)] * 0.05555555555555555
+                              +in[it[0]*n+(it[1]-3)] * -0.05555555555555555
+                              +in[(it[0]+3)*n+it[1]] * 0.05555555555555555
+                              +in[(it[0]-3)*n+it[1]] * -0.05555555555555555;
     });
   });
 }
@@ -103,20 +103,20 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
     cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
     cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-    h.parallel_for<class star3_2d>({n-6,n-6}, {3,3}, [=] (auto it) {
+    h.parallel_for<class star3_2d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * 0.166666666667
-                   +in[xy-dx1] * -0.166666666667
-                   +in[xy+dy1] * 0.166666666667
-                   +in[xy-dy1] * -0.166666666667
-                   +in[xy+dx2] * 0.0833333333333
-                   +in[xy-dx2] * -0.0833333333333
-                   +in[xy+dy2] * 0.0833333333333
-                   +in[xy-dy2] * -0.0833333333333
-                   +in[xy+dx3] * 0.0555555555556
-                   +in[xy-dx3] * -0.0555555555556
-                   +in[xy+dy3] * 0.0555555555556
-                   +in[xy-dy3] * -0.0555555555556;
+        out[xy] += +in[xy+dx1] * 0.16666666666666666
+                   +in[xy-dx1] * -0.16666666666666666
+                   +in[xy+dy1] * 0.16666666666666666
+                   +in[xy-dy1] * -0.16666666666666666
+                   +in[xy+dx2] * 0.08333333333333333
+                   +in[xy-dx2] * -0.08333333333333333
+                   +in[xy+dy2] * 0.08333333333333333
+                   +in[xy-dy2] * -0.08333333333333333
+                   +in[xy+dx3] * 0.05555555555555555
+                   +in[xy-dx3] * -0.05555555555555555
+                   +in[xy+dy3] * 0.05555555555555555
+                   +in[xy-dy3] * -0.05555555555555555;
     });
   });
 }
@@ -126,7 +126,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star4_1d>({n-8,n-8}, {4,4}, [=] (auto it) {
+    h.parallel_for<class star4_1d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125
                               +in[it[0]*n+(it[1]-1)] * -0.125
                               +in[(it[0]+1)*n+it[1]] * 0.125
@@ -135,10 +135,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
                               +in[it[0]*n+(it[1]-2)] * -0.0625
                               +in[(it[0]+2)*n+it[1]] * 0.0625
                               +in[(it[0]-2)*n+it[1]] * -0.0625
-                              +in[it[0]*n+(it[1]+3)] * 0.0416666666667
-                              +in[it[0]*n+(it[1]-3)] * -0.0416666666667
-                              +in[(it[0]+3)*n+it[1]] * 0.0416666666667
-                              +in[(it[0]-3)*n+it[1]] * -0.0416666666667
+                              +in[it[0]*n+(it[1]+3)] * 0.041666666666666664
+                              +in[it[0]*n+(it[1]-3)] * -0.041666666666666664
+                              +in[(it[0]+3)*n+it[1]] * 0.041666666666666664
+                              +in[(it[0]-3)*n+it[1]] * -0.041666666666666664
                               +in[it[0]*n+(it[1]+4)] * 0.03125
                               +in[it[0]*n+(it[1]-4)] * -0.03125
                               +in[(it[0]+4)*n+it[1]] * 0.03125
@@ -160,7 +160,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
     cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
     cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
-    h.parallel_for<class star4_2d>({n-8,n-8}, {4,4}, [=] (auto it) {
+    h.parallel_for<class star4_2d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * 0.125
                    +in[xy-dx1] * -0.125
@@ -170,10 +170,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
                    +in[xy-dx2] * -0.0625
                    +in[xy+dy2] * 0.0625
                    +in[xy-dy2] * -0.0625
-                   +in[xy+dx3] * 0.0416666666667
-                   +in[xy-dx3] * -0.0416666666667
-                   +in[xy+dy3] * 0.0416666666667
-                   +in[xy-dy3] * -0.0416666666667
+                   +in[xy+dx3] * 0.041666666666666664
+                   +in[xy-dx3] * -0.041666666666666664
+                   +in[xy+dy3] * 0.041666666666666664
+                   +in[xy-dy3] * -0.041666666666666664
                    +in[xy+dx4] * 0.03125
                    +in[xy-dx4] * -0.03125
                    +in[xy+dy4] * 0.03125
@@ -187,7 +187,7 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star5_1d>({n-10,n-10}, {5,5}, [=] (auto it) {
+    h.parallel_for<class star5_1d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1
                               +in[it[0]*n+(it[1]-1)] * -0.1
                               +in[(it[0]+1)*n+it[1]] * 0.1
@@ -196,10 +196,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in,
                               +in[it[0]*n+(it[1]-2)] * -0.05
                               +in[(it[0]+2)*n+it[1]] * 0.05
                               +in[(it[0]-2)*n+it[1]] * -0.05
-                              +in[it[0]*n+(it[1]+3)] * 0.0333333333333
-                              +in[it[0]*n+(it[1]-3)] * -0.0333333333333
-                              +in[(it[0]+3)*n+it[1]] * 0.0333333333333
-                              +in[(it[0]-3)*n+it[1]] * -0.0333333333333
+                              +in[it[0]*n+(it[1]+3)] * 0.03333333333333333
+                              +in[it[0]*n+(it[1]-3)] * -0.03333333333333333
+                              +in[(it[0]+3)*n+it[1]] * 0.03333333333333333
+                              +in[(it[0]-3)*n+it[1]] * -0.03333333333333333
                               +in[it[0]*n+(it[1]+4)] * 0.025
                               +in[it[0]*n+(it[1]-4)] * -0.025
                               +in[(it[0]+4)*n+it[1]] * 0.025
@@ -227,7 +227,7 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
     cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
     cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
-    h.parallel_for<class star5_2d>({n-10,n-10}, {5,5}, [=] (auto it) {
+    h.parallel_for<class star5_2d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * 0.1
                    +in[xy-dx1] * -0.1
@@ -237,10 +237,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
                    +in[xy-dx2] * -0.05
                    +in[xy+dy2] * 0.05
                    +in[xy-dy2] * -0.05
-                   +in[xy+dx3] * 0.0333333333333
-                   +in[xy-dx3] * -0.0333333333333
-                   +in[xy+dy3] * 0.0333333333333
-                   +in[xy-dy3] * -0.0333333333333
+                   +in[xy+dx3] * 0.03333333333333333
+                   +in[xy-dx3] * -0.03333333333333333
+                   +in[xy+dy3] * 0.03333333333333333
+                   +in[xy-dy3] * -0.03333333333333333
                    +in[xy+dx4] * 0.025
                    +in[xy-dx4] * -0.025
                    +in[xy+dy4] * 0.025

From d675e38a9a64129fdae52a327bfc941e58c1dba1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 20 Mar 2018 06:06:56 -0700
Subject: [PATCH 066/245] gross fix for SYCL p2p

corner update wasn't coherent with parallel sweep.
this is the wrong solution but is at least correct with ComputeCpp.
triSYCL didn't care because it uses OpenMP host execution.
---
 Cxx11/p2p-hyperplane-sycl.cc | 58 ++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 23 deletions(-)

diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc
index 58f7e7e2d..8d6e23595 100644
--- a/Cxx11/p2p-hyperplane-sycl.cc
+++ b/Cxx11/p2p-hyperplane-sycl.cc
@@ -156,45 +156,57 @@ int main(int argc, char* argv[])
 
   cl::sycl::queue q;
   {
-    cl::sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
-
     for (auto iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) pipeline_time = prk::wtime();
 
-      for (int i=2; i<=2*n-2; i++) {
+      {
+        cl::sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
+
+        for (int i=2; i<=2*n-2; i++) {
 
-        cl::sycl::id<1> I{unsigned(i)};
-        cl::sycl::id<1> One{1};
+          cl::sycl::id<1> I{unsigned(i)};
+          cl::sycl::id<1> One{1};
 
-        q.submit([&](cl::sycl::handler& h) {
+          q.submit([&](cl::sycl::handler& h) {
 
-          auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
+            auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
 
-          unsigned begin = std::max(2,i-n+2);
-          unsigned end   = std::min(i,n)+1;
-          unsigned range = end-begin;
+            unsigned begin = std::max(2,i-n+2);
+            unsigned end   = std::min(i,n)+1;
+            unsigned range = end-begin;
 
-          h.parallel_for<class p2p>(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> J) {
-            cl::sycl::id<1> N{unsigned(n)};
-            cl::sycl::id<1> X{I-J+One};
-            cl::sycl::id<1> Y{J-One};
-            cl::sycl::id<1> Xold{X-One}; // x-1
-            cl::sycl::id<1> Yold{Y-One}; // y-1
-            cl::sycl::id<1> index0{X*N+Y};
-            cl::sycl::id<1> index1{Xold*N+Y};
-            cl::sycl::id<1> index2{X*N+Yold};
-            cl::sycl::id<1> index3{Xold*N+Yold};
-            grid[index0] = grid[index1] + grid[index2] - grid[index3];
+            h.parallel_for<class p2p>(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) {
+              auto J = j.get_id();
+              cl::sycl::id<1> N{unsigned(n)};
+              cl::sycl::id<1> X{I-J+One};
+              cl::sycl::id<1> Y{J-One};
+              cl::sycl::id<1> Xold{X-One}; // x-1
+              cl::sycl::id<1> Yold{Y-One}; // y-1
+              cl::sycl::id<1> index0{X*N+Y};
+              cl::sycl::id<1> index1{Xold*N+Y};
+              cl::sycl::id<1> index2{X*N+Yold};
+              cl::sycl::id<1> index3{Xold*N+Yold};
+              grid[index0] = grid[index1] + grid[index2] - grid[index3];
+              //std::cout << "I,J=" << I[0] << "," << J[0] << "\n";
+            });
           });
-        });
-        q.wait();
+          q.wait();
+        }
       }
       h_grid[0*n+0] = -h_grid[(n-1)*n+(n-1)];
     }
     pipeline_time = prk::wtime() - pipeline_time;
   }
 
+#if 0
+  for (int i=0; i<n; ++i) {
+      for (int j=0; j<n; ++j) {
+          std::cout << i << "," << j << "=" << h_grid[i*n+j] << "\n";
+      }
+  }
+#endif
+
   //////////////////////////////////////////////////////////////////////
   // Analyze and output results.
   //////////////////////////////////////////////////////////////////////

From 0aeddc0a037d7f1f384471052df17a130e62cdda Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 20 Mar 2018 06:32:36 -0700
Subject: [PATCH 067/245] improve SYCL p2p quite a bit

it still performs terribly but at least the design isn't trash.
---
 Cxx11/p2p-hyperplane-sycl.cc | 62 ++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc
index 8d6e23595..305f5de38 100644
--- a/Cxx11/p2p-hyperplane-sycl.cc
+++ b/Cxx11/p2p-hyperplane-sycl.cc
@@ -156,46 +156,52 @@ int main(int argc, char* argv[])
 
   cl::sycl::queue q;
   {
+    cl::sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
+
     for (auto iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) pipeline_time = prk::wtime();
 
-      {
-        cl::sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
-
-        for (int i=2; i<=2*n-2; i++) {
+      for (int i=2; i<=2*n-2; i++) {
 
-          cl::sycl::id<1> I{unsigned(i)};
-          cl::sycl::id<1> One{1};
+        cl::sycl::id<1> I{unsigned(i)};
+        cl::sycl::id<1> One{1};
 
-          q.submit([&](cl::sycl::handler& h) {
+        q.submit([&](cl::sycl::handler& h) {
 
-            auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
+          auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
 
-            unsigned begin = std::max(2,i-n+2);
-            unsigned end   = std::min(i,n)+1;
-            unsigned range = end-begin;
+          unsigned begin = std::max(2,i-n+2);
+          unsigned end   = std::min(i,n)+1;
+          unsigned range = end-begin;
 
-            h.parallel_for<class p2p>(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) {
-              auto J = j.get_id();
-              cl::sycl::id<1> N{unsigned(n)};
-              cl::sycl::id<1> X{I-J+One};
-              cl::sycl::id<1> Y{J-One};
-              cl::sycl::id<1> Xold{X-One}; // x-1
-              cl::sycl::id<1> Yold{Y-One}; // y-1
-              cl::sycl::id<1> index0{X*N+Y};
-              cl::sycl::id<1> index1{Xold*N+Y};
-              cl::sycl::id<1> index2{X*N+Yold};
-              cl::sycl::id<1> index3{Xold*N+Yold};
-              grid[index0] = grid[index1] + grid[index2] - grid[index3];
-              //std::cout << "I,J=" << I[0] << "," << J[0] << "\n";
-            });
+          h.parallel_for<class sweep>(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) {
+            auto J = j.get_id();
+            cl::sycl::id<1> N{unsigned(n)};
+            cl::sycl::id<1> X{I-J+One};
+            cl::sycl::id<1> Y{J-One};
+            cl::sycl::id<1> Xold{X-One}; // x-1
+            cl::sycl::id<1> Yold{Y-One}; // y-1
+            cl::sycl::id<1> index0{X*N+Y};
+            cl::sycl::id<1> index1{Xold*N+Y};
+            cl::sycl::id<1> index2{X*N+Yold};
+            cl::sycl::id<1> index3{Xold*N+Yold};
+            grid[index0] = grid[index1] + grid[index2] - grid[index3];
           });
-          q.wait();
-        }
+        });
+        q.wait();
       }
-      h_grid[0*n+0] = -h_grid[(n-1)*n+(n-1)];
+      q.submit([&](cl::sycl::handler& h) {
+
+        auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
+
+        h.single_task<class corner>([=] {
+            grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+        });
+      });
+      q.wait();
     }
+    q.wait();
     pipeline_time = prk::wtime() - pipeline_time;
   }
 

From b0479199e6ae063c4d47dc33cfdc9a4e2e34914f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 21 Mar 2018 06:25:06 -0700
Subject: [PATCH 068/245] improve SYCL example build options [ci skip]

---
 common/make.defs.llvm | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 817f9da7d..2e1ab47de 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -60,8 +60,17 @@ OPENCLFLAG=-framework OpenCL
 SYCLDIR=/opt/sycl/latest
 SYCLCXX=${SYCLDIR}/bin/compute++
 SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp
+SYCLFLAG+=-std=c++14
 # This makes a huge difference in e.g. nstream...
-SYCLFLAG+=-no-serial-memop
+#SYCLFLAG+=-no-serial-memop
+# CentOS7 and Ubuntu14 built for this
+#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0
+# PRK header rejects GCC4
+#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0
+# If not found automatically
+#SYCLFLAG+=${OPENCLFLAG}
+# NVIDIA target
+#SYCLFLAG+=-sycl-target ptx64
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...

From a6d234dc305ba566f643bbba79301495b7b3c2a6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 26 Mar 2018 09:58:46 -0700
Subject: [PATCH 069/245] ORNL-ACC wavefront (C++ and Fortran) (#325)

ORNL-ACC (aka OpenACC) wavefront

* add ACC p2p w/ innerloop strategy
* add ignoring
* add attempt at OACC async (tasks)
* PGI fixes
* progress on OpenACC p2p
* add ACC p2p in C++
* p2p with ACC in C++
* add ORNL-ACC C++ p2p to Travis
---
 .gitignore                             |   4 +
 Cxx11/Makefile                         |   8 +-
 Cxx11/p2p-hyperplane-vector-ornlacc.cc | 197 +++++++++++++++++++++
 FORTRAN/Makefile                       |   2 +-
 FORTRAN/nstream.f90                    |  30 +++-
 FORTRAN/p2p-async-ornlacc.f90          | 230 +++++++++++++++++++++++++
 FORTRAN/p2p-innerloop-ornlacc.f90      | 191 ++++++++++++++++++++
 travis/build-run-prk.sh                |   5 +
 8 files changed, 659 insertions(+), 8 deletions(-)
 create mode 100644 Cxx11/p2p-hyperplane-vector-ornlacc.cc
 create mode 100644 FORTRAN/p2p-async-ornlacc.f90
 create mode 100644 FORTRAN/p2p-innerloop-ornlacc.f90

diff --git a/.gitignore b/.gitignore
index 44d361e21..1dc8a0272 100644
--- a/.gitignore
+++ b/.gitignore
@@ -194,12 +194,15 @@ FORTRAN/p2p-tasks-openmp
 FORTRAN/p2p-doacross-openmp
 FORTRAN/p2p-innerloop-openmp
 FORTRAN/p2p-datapar-openmp
+FORTRAN/p2p-innerloop-ornlacc
+FORTRAN/p2p-ornlacc
 FORTRAN/stencil
 FORTRAN/stencil-coarray
 FORTRAN/stencil-openmp
 FORTRAN/stencil-openmp-target
 FORTRAN/stencil-pretty
 FORTRAN/stencil-taskloop-openmp
+FORTRAN/stencil-ornlacc
 FORTRAN/transpose
 FORTRAN/transpose-coarray
 FORTRAN/transpose-openmp
@@ -207,6 +210,7 @@ FORTRAN/transpose-openmp-target
 FORTRAN/transpose-pretty
 FORTRAN/transpose-taskloop-openmp
 FORTRAN/transpose-tasks-openmp
+FORTRAN/transpose-ornlacc
 RUST/p2p
 RUST/stencil
 RUST/transpose
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 190b44dbb..5538ceabd 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -44,6 +44,7 @@ STLFLAGS = $(STLFLAG) $(BOOSTFLAGS)
 PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS)
 RAJAFLAGS = $(RAJAFLAG)
 KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS)
+ORNLACCFLAGS = $(ORNLACCFLAG)
 
 ifdef OCCADIR
   include ${OCCADIR}/scripts/makefile
@@ -66,7 +67,7 @@ all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
-     p2p-hyperplane-sycl
+     p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc
 
 stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
@@ -117,6 +118,8 @@ cublas: transpose-cublas nstream-cublas
 
 occa: transpose-occa nstream-occa
 
+ornlacc: p2p-hyperplane-vector-ornlacc
+
 boost-compute: nstream-vector-boost-compute
 # busted
 #nstream-valarray-boost-compute
@@ -181,6 +184,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.)
 	$(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@
 
+%-ornlacc: %-ornlacc.cc prk_util.h
+	$(CXX) $(CXXFLAGS) $< $(ORNLACCFLAGS) -o $@
+
 %: %.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
 
diff --git a/Cxx11/p2p-hyperplane-vector-ornlacc.cc b/Cxx11/p2p-hyperplane-vector-ornlacc.cc
new file mode 100644
index 000000000..eb4a092e1
--- /dev/null
+++ b/Cxx11/p2p-hyperplane-vector-ornlacc.cc
@@ -0,0 +1,197 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/ORNL-ACC HYPERPLANE pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int n, nc, nb;
+  try {
+      if (argc < 3) {
+        throw " <# iterations> <array dimension> [<chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 0) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(n)*static_cast<size_t>(n) > INT_MAX) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      nc = (argc > 3) ? std::atoi(argv[3]) : 1;
+      nc = std::max(1,nc);
+      nc = std::min(n,nc);
+
+      // number of grid blocks
+      nb = (n-1)/nc;
+      if ((n-1)%nc) nb++;
+      //std::cerr << "n="  << n << std::endl;
+      //std::cerr << "nb=" << nb << std::endl;
+      //std::cerr << "nc=" << nc << std::endl;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << nc << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto pipeline_time = 0.0;
+
+  double * grid = new double[n*n];
+
+  for (int i=0; i<n; i++) {
+    for (int j=0; j<n; j++) {
+      grid[i*n+j] = 0.0;
+    }
+  }
+  for (int j=0; j<n; j++) {
+    grid[0*n+j] = static_cast<double>(j);
+  }
+  for (int i=0; i<n; i++) {
+    grid[i*n+0] = static_cast<double>(i);
+  }
+
+  #pragma acc data pcopy(grid)
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) pipeline_time = prk::wtime();
+
+      if (nc==1) {
+        for (int i=2; i<=2*n-2; i++) {
+          #pragma acc parallel loop independent
+          for (int j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+            const int x = i-j+1;
+            const int y = j-1;
+            grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+          }
+        }
+      } else {
+        for (int i=2; i<=2*(nb+1)-2; i++) {
+          #pragma acc parallel loop gang
+          for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) {
+            const int ib = nc*(i-j)+1;
+            const int jb = nc*(j-2)+1;
+            //sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+            #pragma acc loop vector
+            for (int i=ib; i<std::min(n,ib+nc); i++) {
+              for (int j=jb; j<std::min(n,jb+nc); j++) {
+                grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+              }
+            }
+          }
+        }
+      }
+      #pragma acc kernels
+      {
+        grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+      }
+    }
+    pipeline_time = prk::wtime() - pipeline_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(2.*n-2.));
+  if ( (std::fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << grid[(n-1)*n+(n-1)]
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index ee57e8255..898a237c4 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -59,7 +59,7 @@ coarray: p2p-coarray stencil-coarray transpose-coarray
 
 target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgemm-openmp-target
 
-ornlacc: p2p-ornlacc stencil-ornlacc transpose-ornlacc nstream-ornlacc
+ornlacc: p2p-ornlacc p2p-innerloop-ornlacc stencil-ornlacc transpose-ornlacc nstream-ornlacc
 
 %: %.f90
 	$(FC) $(FCFLAGS) $< -o $@
diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90
index 5b7799e1f..63986ab54 100644
--- a/FORTRAN/nstream.f90
+++ b/FORTRAN/nstream.f90
@@ -183,13 +183,24 @@ program main
 #if defined(_OPENMP)
   !$omp do
   do i=1,length
+    A(i) = 0
+    B(i) = 2
+    C(i) = 2
+  enddo
+  !$omp end do
+#elif defined(PGI)
+  forall (i=1:length)
+    A(i) = 0
+    B(i) = 2
+    C(i) = 2
+  end forall
 #else
   do concurrent (i=1:length)
-#endif
     A(i) = 0
     B(i) = 2
     C(i) = 2
   enddo
+#endif
 
   ! need this because otherwise no barrier between initialization
   ! and iteration 0 (warmup), which will lead to incorrectness.
@@ -211,11 +222,18 @@ program main
 #if defined(_OPENMP)
     !$omp do
     do i=1,length
+      A(i) = A(i) + B(i) + scalar * C(i)
+    enddo
+    !$omp end do
+#elif defined(PGI)
+    forall (i=1:length)
+      A(i) = A(i) + B(i) + scalar * C(i)
+    end forall
 #else
     do concurrent (i=1:length)
-#endif
       A(i) = A(i) + B(i) + scalar * C(i)
     enddo
+#endif
   enddo ! iterations
 
   t1 = prk_get_wtime()
@@ -241,16 +259,16 @@ program main
   ar = ar * length
 
   asum = 0
-#if defined(_OPENMP)
+#if defined(_OPENMP) || defined(PGI)
   !$omp parallel do reduction(+:asum)
   do i=1,length
+    asum = asum + abs(A(i))
+  enddo
+  !$omp end parallel do
 #else
   do concurrent (i=1:length)
-#endif
     asum = asum + abs(A(i))
   enddo
-#ifdef _OPENMP
-  !$omp end parallel do
 #endif
 
   deallocate( C )
diff --git a/FORTRAN/p2p-async-ornlacc.f90 b/FORTRAN/p2p-async-ornlacc.f90
new file mode 100644
index 000000000..e42cbb46d
--- /dev/null
+++ b/FORTRAN/p2p-async-ornlacc.f90
@@ -0,0 +1,230 @@
+!
+! Copyright (c) 2015, Intel Corporation
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!
+! * Redistributions of source code must retain the above copyright
+!       notice, this list of conditions and the following disclaimer.
+! * Redistributions in binary form must reproduce the above
+!       copyright notice, this list of conditions and the following
+!       disclaimer in the documentation and/or other materials provided
+!       with the distribution.
+! * Neither the name of Intel Corporation nor the names of its
+!       contributors may be used to endorse or promote products
+!       derived from this software without specific prior written
+!       permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+
+!*******************************************************************
+! NAME:    Pipeline
+!
+! PURPOSE: This program tests the efficiency with which point-to-point
+!          synchronization can be carried out. It does so by executing
+!          a pipelined algorithm on an m*n grid. The first array dimension
+!          is distributed among the threads (stripwise decomposition).
+!
+! USAGE:   The program takes as input the
+!          dimensions of the grid, and the number of iterations on the grid
+!
+!                <progname> <iterations> <m> <n>
+!
+!          The output consists of diagnostics to make sure the
+!          algorithm worked, and of timing statistics.
+!
+! FUNCTIONS CALLED:
+!
+!          Other than standard C functions, the following
+!          functions are used in this program:
+!
+! HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+!            Converted to Fortran by Jeff Hammond, January 2016.
+! *******************************************************************
+
+function prk_get_wtime() result(t)
+  use iso_fortran_env
+  implicit none
+  real(kind=REAL64) ::  t
+  integer(kind=INT64) :: c, r
+  call system_clock(count = c, count_rate = r)
+  t = real(c,REAL64) / real(r,REAL64)
+end function prk_get_wtime
+
+subroutine sweep_tile(startm,endm,startn,endn,m,n,grid)
+  use iso_fortran_env
+  implicit none
+  integer(kind=INT32), intent(in) :: m,n
+  integer(kind=INT32), intent(in) :: startm,endm
+  integer(kind=INT32), intent(in) :: startn,endn
+  real(kind=REAL64), intent(inout) ::  grid(m,n)
+  integer(kind=INT32) :: i,j
+  !$acc kernels
+  do j=startn,endn
+    do i=startm,endm
+      grid(i,j) = grid(i-1,j) + grid(i,j-1) - grid(i-1,j-1)
+    enddo
+  enddo
+  !$acc end kernels
+end subroutine
+
+program main
+  use iso_fortran_env
+  implicit none
+  real(kind=REAL64) :: prk_get_wtime
+  ! for argument parsing
+  integer :: err
+  integer :: arglen
+  character(len=32) :: argtmp
+  ! problem definition
+  integer(kind=INT32) :: iterations                     ! number of times to run the pipeline algorithm
+  integer(kind=INT32) :: m, n
+  real(kind=REAL64) :: corner_val                       ! verification value at top right corner of grid
+  real(kind=REAL64), allocatable :: grid(:,:)           ! array holding grid values
+  ! runtime variables
+  integer(kind=INT32) :: i, j, k
+  integer(kind=INT32) :: ic, mc                         ! ic = chunking index, mc = chunking dimension
+  integer(kind=INT32) :: jc, nc                         ! jc = chunking index, nc = chunking dimension
+  integer(kind=INT32) :: lic, ljc                       ! hold indexes of last block
+  real(kind=REAL64) ::  t0, t1, pipeline_time, avgtime  ! timing parameters
+  real(kind=REAL64), parameter ::  epsilon=1.D-8        ! error tolerance
+
+  ! ********************************************************************
+  ! read and test input parameters
+  ! ********************************************************************
+
+  write(*,'(a25)') 'Parallel Research Kernels'
+  write(*,'(a52)') 'Fortran ORNL-ACC TASKS pipeline execution on 2D grid'
+
+  if (command_argument_count().lt.2) then
+    write(*,'(a17,i1)') 'argument count = ', command_argument_count()
+    write(*,'(a34,2a39)')  'Usage: ./synch_p2p <# iterations> ',      &
+                           '<array x-dimension> <array y-dimension>', &
+                           '<chunk x-dimension> <chunk y-dimension>'
+    stop 1
+  endif
+
+  iterations = 1
+  call get_command_argument(1,argtmp,arglen,err)
+  if (err.eq.0) read(argtmp,'(i32)') iterations
+
+  m = 1
+  call get_command_argument(2,argtmp,arglen,err)
+  if (err.eq.0) read(argtmp,'(i32)') m
+
+  n = m
+  if (command_argument_count().gt.2) then
+    call get_command_argument(3,argtmp,arglen,err)
+    if (err.eq.0) read(argtmp,'(i32)') n
+
+    mc = m
+    call get_command_argument(4,argtmp,arglen,err)
+    if (err.eq.0) read(argtmp,'(i32)') mc
+
+    nc = n
+    call get_command_argument(5,argtmp,arglen,err)
+    if (err.eq.0) read(argtmp,'(i32)') nc
+  endif
+
+  if (iterations .lt. 1) then
+    write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations
+    stop 1
+  endif
+
+  if ((m .lt. 1).or.(n .lt. 1)) then
+    write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n
+    stop 1
+  endif
+
+  ! mc=m or nc=n disables chunking in that dimension, which means
+  ! there is no task parallelism to exploit
+  !if (((mc.lt.1).or.(mc.gt.m)).or.((nc.lt.1).or.(nc.gt.n))) then
+  !  mc = int(m/omp_get_max_threads())
+  !  nc = int(n/omp_get_max_threads())
+  !endif
+  mc = max(1,mc)
+  nc = max(1,nc)
+
+  write(*,'(a,i8)')    'Number of iterations     = ', iterations
+  write(*,'(a,i8,i8)') 'Grid sizes               = ', m, n
+  write(*,'(a,i8,i8)') 'Size of chunking         = ', mc, nc
+
+  allocate( grid(m,n), stat=err)
+  if (err .ne. 0) then
+    write(*,'(a,i3)') 'allocation of grid returned ',err
+    stop 1
+  endif
+
+  lic = (m/mc-1) * mc + 2
+  ljc = (n/nc-1) * nc + 2
+
+  !$acc parallel loop gang
+  do j=1,n
+    !$acc loop vector
+    do i=1,m
+      grid(i,j) = 0.0d0
+    enddo
+  enddo
+  do j=1,n
+    grid(1,j) = real(j-1,REAL64)
+  enddo
+  do i=1,m
+    grid(i,1) = real(i-1,REAL64)
+  enddo
+
+  !$acc data pcopy(grid)
+
+  do k=0,iterations
+
+    if (k.eq.1) t0 = prk_get_wtime()
+
+    do ic=2,m,mc
+      do jc=2,n,nc
+        !$acc  async(grid(ic,jc))      wait(grid(1,1)) &
+        !$acc& wait(grid(ic-mc,jc-nc)) wait(grid(ic-mc,jc)) &
+        !$acc& wait(grid(ic,jc-nc))    wait(grid(ic,jc))
+        call sweep_tile(ic,min(m,ic+mc-1),jc,min(n,jc+nc-1),m,n,grid)
+      enddo
+    enddo
+    !$acc async(grid(1,1)) wait(grid(lic,ljc))
+    grid(1,1) = -grid(m,n)
+
+  enddo
+
+  t1 = prk_get_wtime()
+  pipeline_time = t1 - t0
+
+  !$acc end data
+
+  ! ********************************************************************
+  ! ** Analyze and output results.
+  ! ********************************************************************
+
+  ! verify correctness, using top right value
+  corner_val = real((iterations+1)*(n+m-2),REAL64);
+  if (abs(grid(m,n)-corner_val)/corner_val .gt. epsilon) then
+    write(*,'(a,f10.2,a,f10.2)') 'ERROR: checksum ',grid(m,n), &
+            ' does not match verification value ', corner_val
+    stop 1
+  endif
+
+  write(*,'(a)') 'Solution validates'
+  avgtime = pipeline_time/iterations
+  write(*,'(a,f13.6,a,f10.6)') 'Rate (MFlop/s): ',2.d-6*real((m-1)*(n-1),INT64)/avgtime, &
+         ' Avg time (s): ', avgtime
+
+  deallocate( grid )
+
+end program
diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.f90
new file mode 100644
index 000000000..9e5ff8da7
--- /dev/null
+++ b/FORTRAN/p2p-innerloop-ornlacc.f90
@@ -0,0 +1,191 @@
+!
+! Copyright (c) 2015, Intel Corporation
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!
+! * Redistributions of source code must retain the above copyright
+!       notice, this list of conditions and the following disclaimer.
+! * Redistributions in binary form must reproduce the above
+!       copyright notice, this list of conditions and the following
+!       disclaimer in the documentation and/or other materials provided
+!       with the distribution.
+! * Neither the name of Intel Corporation nor the names of its
+!       contributors may be used to endorse or promote products
+!       derived from this software without specific prior written
+!       permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+
+!*******************************************************************
+! NAME:    Pipeline
+!
+! PURPOSE: This program tests the efficiency with which point-to-point
+!          synchronization can be carried out. It does so by executing
+!          a pipelined algorithm on an m*n grid. The first array dimension
+!          is distributed among the threads (stripwise decomposition).
+!
+! USAGE:   The program takes as input the
+!          dimensions of the grid, and the number of iterations on the grid
+!
+!                <progname> <iterations> <m> <n>
+!
+!          The output consists of diagnostics to make sure the
+!          algorithm worked, and of timing statistics.
+!
+! FUNCTIONS CALLED:
+!
+!          Other than standard C functions, the following
+!          functions are used in this program:
+!
+! HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+!            Converted to Fortran by Jeff Hammond, January 2016.
+! *******************************************************************
+
+function prk_get_wtime() result(t)
+  use iso_fortran_env
+  implicit none
+  real(kind=REAL64) ::  t
+  integer(kind=INT64) :: c, r
+  call system_clock(count = c, count_rate = r)
+  t = real(c,REAL64) / real(r,REAL64)
+end function prk_get_wtime
+
+program main
+  use iso_fortran_env
+  implicit none
+  real(kind=REAL64) :: prk_get_wtime
+  ! for argument parsing
+  integer :: err
+  integer :: arglen
+  character(len=32) :: argtmp
+  ! problem definition
+  integer(kind=INT32) :: iterations                     ! number of times to run the pipeline algorithm
+  integer(kind=INT32) :: n
+  real(kind=REAL64) :: corner_val                       ! verification value at top right corner of grid
+  real(kind=REAL64), allocatable :: grid(:,:)           ! array holding grid values
+  ! runtime variables
+  integer(kind=INT32) :: i, j, k
+  integer(kind=INT32) :: x, y
+  real(kind=REAL64) ::  t0, t1, pipeline_time, avgtime  ! timing parameters
+  real(kind=REAL64), parameter ::  epsilon=1.D-8        ! error tolerance
+
+  ! ********************************************************************
+  ! read and test input parameters
+  ! ********************************************************************
+
+  write(*,'(a25)') 'Parallel Research Kernels'
+  write(*,'(a55)') 'Fortran OpenACC INNERLOOP pipeline execution on 2D grid'
+
+  if (command_argument_count().lt.2) then
+    write(*,'(a17,i1)') 'argument count = ', command_argument_count()
+    write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ',  &
+                         '<grid dimension>'
+    stop 1
+  endif
+
+  iterations = 1
+  call get_command_argument(1,argtmp,arglen,err)
+  if (err.eq.0) read(argtmp,'(i32)') iterations
+
+  n = 1
+  call get_command_argument(2,argtmp,arglen,err)
+  if (err.eq.0) read(argtmp,'(i32)') n
+
+  if (n .gt. 16384) then
+    write(*,'(a,i5)') 'WARNING: grid size exceeds 16384: ', n
+    write(*,'(a)')    'PGI 17.10 + CUDA 9.0 generates illegal address'
+  endif
+
+  if (iterations .lt. 1) then
+    write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations
+    stop 1
+  endif
+
+  if (n .lt. 1) then
+    write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', n
+    stop 1
+  endif
+
+#ifdef _OPENMP
+  write(*,'(a,i8)')    'Number of threads        = ', omp_get_max_threads()
+#endif
+  write(*,'(a,i8)')    'Number of iterations     = ', iterations
+  write(*,'(a,i8,i8)') 'Grid sizes               = ', n, n
+
+  allocate( grid(n,n), stat=err)
+  if (err .ne. 0) then
+    write(*,'(a,i3)') 'allocation of grid returned ',err
+    stop 1
+  endif
+
+  do j=1,n
+    do i=1,n
+      grid(i,j) = 0.0d0
+    enddo
+  enddo
+  do j=1,n
+    grid(1,j) = real(j-1,REAL64)
+  enddo
+  do i=1,n
+    grid(i,1) = real(i-1,REAL64)
+  enddo
+
+  !$acc data pcopy(grid)
+
+  do k=0,iterations
+
+    if (k.eq.1) t0 = prk_get_wtime()
+
+    do i=2,2*n-2
+      !$acc parallel loop independent
+      do j=max(2,i-n+2),min(i,n)
+        x = i-j+2
+        y = j
+        grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1)
+      enddo
+    enddo
+    !$acc kernels
+    grid(1,1) = -grid(n,n)
+    !$acc end kernels
+
+  enddo
+
+  t1 = prk_get_wtime()
+
+  !$acc end data
+
+  pipeline_time = t1 - t0
+
+  ! ********************************************************************
+  ! ** Analyze and output results.
+  ! ********************************************************************
+
+  ! verify correctness, using top right value
+  corner_val = real((iterations+1)*(2*n-2),REAL64);
+  if (abs(grid(n,n)-corner_val)/corner_val .gt. epsilon) then
+    write(*,'(a,f10.2,a,f10.2)') 'ERROR: checksum ',grid(n,n), &
+            ' does not match verification value ', corner_val
+    stop 1
+  endif
+
+  write(*,'(a)') 'Solution validates'
+  avgtime = pipeline_time/iterations
+  write(*,'(a,f13.6,a,f10.6)') 'Rate (MFlop/s): ',2.d-6*real((n-1)*(n-1),REAL64)/avgtime, &
+         ' Avg time (s): ', avgtime
+
+  deallocate( grid )
+
+end program
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 7e78338fd..73883df11 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -384,6 +384,11 @@ case "$PRK_TARGET" in
                         $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
                     done
                 done
+                # ORNL-ACC
+                echo "ORNLACCFLAG=-fopenacc" >> common/make.defs
+                make -C $PRK_TARGET_PATH p2p-hyperplane-vector-ornlacc
+                $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc     10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc     10 1024 64
                 ;;
             clang)
                 # Host

From ec62d470095b6021345e6a25b52729ea30253cb0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 4 Apr 2018 14:28:13 -0700
Subject: [PATCH 070/245] add CUBLAS DGEMM (#326)

- add a bunch of overdue ignoring
- twiddle transpose-cublas output
---
 .gitignore                |  27 +++-
 Cxx11/Makefile            |   4 +-
 Cxx11/dgemm-cublas.cu     | 252 ++++++++++++++++++++++++++++++++++++++
 Cxx11/transpose-cublas.cu |   4 +-
 4 files changed, 280 insertions(+), 7 deletions(-)
 create mode 100644 Cxx11/dgemm-cublas.cu

diff --git a/.gitignore b/.gitignore
index 1dc8a0272..259f7766b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,9 @@ octave-workspace                # Octave crashes
 *.swp                           # Vim
 */*.swp
 */*/*.swp
+*.swo                           # Vim
+*/*.swo
+*/*/*.swo
 *.dSYM                          # Mac
 */*.dSYM
 */*/*.dSYM
@@ -43,6 +46,10 @@ func.c                          # PRK C89 stencil generated code
 *.output                        # ALCF Cobalt scheduler
 *.error                         # ALCF Cobalt scheduler
 
+*.log
+*.log2
+*.log3
+
 MPI1/AMR/amr
 MPI1/Branch/branch
 MPI1/DGEMM/dgemm
@@ -106,6 +113,8 @@ C1z/transpose-target
 C1z/transpose-taskloop
 C1z/transpose-ispc
 Cxx11/dgemm-vector
+Cxx11/dgemm-cblas
+Cxx11/dgemm-cublas
 Cxx11/p2p-openmp-target
 Cxx11/p2p-tasks-openmp
 Cxx11/p2p-vector
@@ -119,6 +128,10 @@ Cxx11/p2p-innerloop-vector
 Cxx11/p2p-hyperplane-vector
 Cxx11/p2p-hyperplane-vector-openmp
 Cxx11/p2p-innerloop-vector-tbb
+Cxx11/p2p-hyperplane-vector-stl
+Cxx11/p2p-hyperplane-vector-pstl
+Cxx11/p2p-hyperplane-vector-tbb
+Cxx11/p2p-hyperplane-sycl
 Cxx11/nstream-kokkos
 Cxx11/nstream-opencl
 Cxx11/nstream-valarray
@@ -132,6 +145,10 @@ Cxx11/nstream-vector-taskloop
 Cxx11/nstream-vector-tbb
 Cxx11/nstream-valarray-boost-compute
 Cxx11/nstream-vector-boost-compute
+Cxx11/nstream-cublas
+Cxx11/nstream-cuda
+Cxx11/nstream-openmp-target
+Cxx11/nstream-sycl
 Cxx11/sparse-vector
 Cxx11/stencil-opencl
 Cxx11/stencil-openmp-target
@@ -145,7 +162,10 @@ Cxx11/stencil-vector-rangefor
 Cxx11/stencil-vector-tbb
 Cxx11/stencil-vector-taskloop
 Cxx11/stencil-kokkos
+Cxx11/stencil-cuda
+Cxx11/stencil-sycl
 Cxx11/transpose-opencl
+Cxx11/transpose-sycl
 Cxx11/transpose-openmp-target
 Cxx11/transpose-valarray
 Cxx11/transpose-vector
@@ -160,6 +180,8 @@ Cxx11/transpose-vector-rangefor
 Cxx11/transpose-vector-tbb
 Cxx11/transpose-vector-taskloop
 Cxx11/transpose-kokkos
+Cxx11/transpose-cublas
+Cxx11/transpose-cuda
 Cxx11/grid1.cl
 Cxx11/grid2.cl
 Cxx11/grid3.cl
@@ -181,11 +203,13 @@ Cxx11/star9.cl
 FORTRAN/dgemm-taskloop-openmp
 FORTRAN/dgemm-pretty
 FORTRAN/dgemm-openmp
+FORTRAN/dgemm-openmp-target
 FORTRAN/dgemm
 FORTRAN/nstream
 FORTRAN/nstream-openmp
 FORTRAN/nstream-pretty
 FORTRAN/nstream-taskloop-openmp
+FORTRAN/nstream-openmp-target
 FORTRAN/p2p
 FORTRAN/p2p-innerloop
 FORTRAN/p2p-coarray
@@ -214,6 +238,3 @@ FORTRAN/transpose-ornlacc
 RUST/p2p
 RUST/stencil
 RUST/transpose
-p2p-hyperplane-vector-stl
-p2p-hyperplane-vector-pstl
-p2p-hyperplane-vector-tbb
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 5538ceabd..99c2b34b8 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -82,7 +82,7 @@ nstream: nstream-valarray nstream-vector nstream-vector-openmp nstream-openmp-ta
 	 nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \
 	 nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl
 
-dgemm: dgemm-vector dgemm-cblas
+dgemm: dgemm-vector dgemm-cblas dgemm-cublas
 
 vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \
 	transpose-vector-async transpose-vector-thread
@@ -114,7 +114,7 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r
 
 cuda: stencil-cuda transpose-cuda nstream-cuda
 
-cublas: transpose-cublas nstream-cublas
+cublas: transpose-cublas nstream-cublas dgemm-cublas
 
 occa: transpose-occa nstream-occa
 
diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu
new file mode 100644
index 000000000..3dad895a0
--- /dev/null
+++ b/Cxx11/dgemm-cublas.cu
@@ -0,0 +1,252 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix
+///          dense multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a tile size for matrix
+///          blocking
+///
+///          <progname> <# iterations> <matrix order> [<tile size>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+
+__global__ void init(unsigned order, double * A, double * B, double * C)
+{
+    auto i = blockIdx.x * blockDim.x + threadIdx.x;
+    auto j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if ((i<order) && (j<order)) {
+      A[i*order+j] = i;
+      B[i*order+j] = i;
+      C[i*order+j] = 0;
+    }
+}
+
+__global__ void init(unsigned order, double * C)
+{
+    auto i = blockIdx.x * blockDim.x + threadIdx.x;
+    auto j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if ((i<order) && (j<order)) {
+      C[i*order+j] = 0;
+    }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int input_copy = 0;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> <copy input every iteration [0/1]>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      if (argc > 3) {
+        input_copy = std::atoi(argv[2]);
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Input copy           = " << (input_copy ? "yes" : "no") << std::endl;
+
+  cublasHandle_t h;
+  prk::CUDA::check( cublasCreate(&h) );
+
+  int tile_size = 32;
+  dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
+  dim3 dimBlock(tile_size, tile_size, 1);
+
+  info.checkDims(dimBlock, dimGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  double dgemm_time(0);
+
+  const size_t nelems = (size_t)order * (size_t)order;
+  const size_t bytes = nelems * sizeof(double);
+
+  // host buffers
+  double * h_a;
+  double * h_b;
+  double * h_c;
+  prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) );
+  prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) );
+  prk::CUDA::check( cudaMallocHost((void**)&h_c, bytes) );
+
+  // device buffers
+  double * d_a;
+  double * d_b;
+  double * d_c;
+  prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) );
+  prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) );
+  prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) );
+
+  if (input_copy) {
+
+    for (int i=0; i<order; ++i) {
+      for (int j=0; j<order; ++j) {
+         h_a[i*order+j] = i;
+         h_b[i*order+j] = i;
+      }
+    }
+
+    prk::CUDA::check( cudaMemcpy(d_a, &(h_a[0]), bytes, cudaMemcpyHostToDevice) );
+    prk::CUDA::check( cudaMemcpy(d_b, &(h_b[0]), bytes, cudaMemcpyHostToDevice) );
+
+    init<<<dimGrid, dimBlock>>>(order, d_c);
+
+  } else {
+
+    init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+
+  }
+
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) dgemm_time = prk::wtime();
+
+      if (input_copy) {
+        prk::CUDA::check( cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice) );
+        prk::CUDA::check( cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice) );
+      }
+
+      double one(1);
+      prk::CUDA::check( cublasDgemm(h,
+                                    CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
+                                    order, order, order,      // m, n, k
+                                    &one,                     // alpha
+                                    d_a, order,               // A, lda
+                                    d_b, order,               // B, ldb
+                                    &one,                     // beta
+                                    d_c, order) );            // C, ldc
+      prk::CUDA::check( cudaDeviceSynchronize() );
+    }
+    dgemm_time = prk::wtime() - dgemm_time;
+  }
+
+  // copy output back to host
+  prk::CUDA::check( cudaMemcpy(&(h_c[0]), d_c, bytes, cudaMemcpyDeviceToHost) );
+
+  prk::CUDA::check( cudaFree(d_c) );
+  prk::CUDA::check( cudaFree(d_b) );
+  prk::CUDA::check( cudaFree(d_a) );
+
+  prk::CUDA::check( cudaFreeHost(h_a) );
+  prk::CUDA::check( cudaFreeHost(h_b) );
+
+  prk::CUDA::check( cublasDestroy(h) );
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto epsilon = 1.0e-8;
+  const auto forder = static_cast<double>(order);
+  const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
+  const auto checksum = prk_reduce( &(h_c[0]), &(h_c[nelems]), 0.0);
+  const auto residuum = std::abs(checksum-reference)/reference;
+
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum = " << checksum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = dgemm_time/iterations;
+    auto nflops = 2.0 * std::pow(forder,3);
+    std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum = " << checksum << std::endl;
+    return 1;
+  }
+
+  prk::CUDA::check( cudaFreeHost(h_c) );
+
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose-cublas.cu b/Cxx11/transpose-cublas.cu
index 2ec85d35b..4f265599c 100644
--- a/Cxx11/transpose-cublas.cu
+++ b/Cxx11/transpose-cublas.cu
@@ -93,8 +93,8 @@ int main(int argc, char * argv[])
     return 1;
   }
 
-  std::cout << "Matrix order          = " << order << std::endl;
-  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
 
   cublasHandle_t h;
   //prk::CUDA::check( cublasInit() );

From 509e765bb964984bc9e40b8fbe87a3fcec4a8ecc Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 4 Apr 2018 14:30:17 -0700
Subject: [PATCH 071/245] bugfix input parse arg 3

---
 Cxx11/dgemm-cublas.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu
index 3dad895a0..08bcf80c7 100644
--- a/Cxx11/dgemm-cublas.cu
+++ b/Cxx11/dgemm-cublas.cu
@@ -116,7 +116,7 @@ int main(int argc, char * argv[])
       }
 
       if (argc > 3) {
-        input_copy = std::atoi(argv[2]);
+        input_copy = std::atoi(argv[3]);
       }
   }
   catch (const char * e) {

From 803217dd36c03751110997be80b70adf511bf215 Mon Sep 17 00:00:00 2001
From: caizixian <caizixian@users.noreply.github.com>
Date: Tue, 10 Apr 2018 03:37:50 +1000
Subject: [PATCH 072/245] Fix misleading comments (#327)

---
 OPENMP/Transpose/transpose.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OPENMP/Transpose/transpose.c b/OPENMP/Transpose/transpose.c
index f89623278..70b67178c 100644
--- a/OPENMP/Transpose/transpose.c
+++ b/OPENMP/Transpose/transpose.c
@@ -42,7 +42,7 @@ USAGE:   Program input is three command line arguments that give the
          matrix order, the number of times to repeat the operation 
          (iterations), and the number of threads to use:
 
-         transpose <# threads> <matrix_size> <# iterations> [tile size]
+         transpose <# threads> <# iterations> <matrix_size> [tile size]
 
          An optional parameter specifies the tile size used to divide the
          individual matrix blocks for improved cache and TLB performance. 

From b7bddfcfd2398a5b5908d2012d084eaecd571a26 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 12 Apr 2018 12:21:07 -0700
Subject: [PATCH 073/245] PGI+GPU cleanup (#328)

fix a bunch of things

* add OACC p2p w/ innerloop strategy
* git ignore binaries
* add attempt at OACC async (tasks)
* PGI fixes
* PGI fixes
* add OACC p2p in C++
* add OACC C++ p2p to Travis
* explicit grid size because PGI 17.4 complained
* clean ornlacc binaries
* add comment to help user
* cleanup printout
* ignore C++ header checking with PGI
* fix pcopy bug
* add CUDA to PGI example; add flag that fixes p2p issue
* add PGI example flags for PSTL/RAJA/KOKKOS (only STL tested)
* add warning about correctness issue
---
 .gitignore                             |  3 ++
 Cxx11/Makefile                         |  1 +
 Cxx11/p2p-hyperplane-vector-ornlacc.cc |  2 +-
 Cxx11/prk_util.h                       |  2 +-
 Cxx11/stencil-cuda.cu                  |  3 ++
 FORTRAN/nstream-openmp-target.f90      | 11 +++---
 FORTRAN/nstream-ornlacc.f90            | 13 ++------
 FORTRAN/nstream-pretty.f90             |  6 ++--
 FORTRAN/nstream-taskloop-openmp.f90    |  8 ++---
 FORTRAN/nstream.f90                    | 18 ++++++----
 FORTRAN/p2p-innerloop-ornlacc.f90      |  1 +
 common/make.defs.pgi                   | 46 ++++++++++++++++++++++++--
 12 files changed, 81 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index 259f7766b..d5100141d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -210,6 +210,7 @@ FORTRAN/nstream-openmp
 FORTRAN/nstream-pretty
 FORTRAN/nstream-taskloop-openmp
 FORTRAN/nstream-openmp-target
+FORTRAN/nstream-ornlacc
 FORTRAN/p2p
 FORTRAN/p2p-innerloop
 FORTRAN/p2p-coarray
@@ -225,6 +226,7 @@ FORTRAN/stencil-coarray
 FORTRAN/stencil-openmp
 FORTRAN/stencil-openmp-target
 FORTRAN/stencil-pretty
+FORTRAN/stencil-ornlacc
 FORTRAN/stencil-taskloop-openmp
 FORTRAN/stencil-ornlacc
 FORTRAN/transpose
@@ -232,6 +234,7 @@ FORTRAN/transpose-coarray
 FORTRAN/transpose-openmp
 FORTRAN/transpose-openmp-target
 FORTRAN/transpose-pretty
+FORTRAN/transpose-ornlacc
 FORTRAN/transpose-taskloop-openmp
 FORTRAN/transpose-tasks-openmp
 FORTRAN/transpose-ornlacc
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 99c2b34b8..3af1ad2e7 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -221,6 +221,7 @@ clean:
 	-rm -f *-cblas
 	-rm -f *-occa
 	-rm -f *-boost-compute
+	-rm -f *-ornlacc
 	-rm -f transpose-vector-async transpose-vector-thread
 
 cleancl:
diff --git a/Cxx11/p2p-hyperplane-vector-ornlacc.cc b/Cxx11/p2p-hyperplane-vector-ornlacc.cc
index eb4a092e1..05aac1ced 100644
--- a/Cxx11/p2p-hyperplane-vector-ornlacc.cc
+++ b/Cxx11/p2p-hyperplane-vector-ornlacc.cc
@@ -132,7 +132,7 @@ int main(int argc, char* argv[])
     grid[i*n+0] = static_cast<double>(i);
   }
 
-  #pragma acc data pcopy(grid)
+  #pragma acc data pcopy(grid[0:n*n])
   {
     for (auto iter = 0; iter<=iterations; iter++) {
 
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 0109ba684..8bb718fe0 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -40,7 +40,7 @@
 #include <cassert>
 
 // Test standard library _after_ standard headers have been included...
-#if !defined(__NVCC__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI)
+#if !defined(__NVCC__) && !defined(__PGI) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI)
 # error You are using an ancient version GNU libstdc++.  Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option.
 #endif
 
diff --git a/Cxx11/stencil-cuda.cu b/Cxx11/stencil-cuda.cu
index ba544ada7..fcfe9e48c 100644
--- a/Cxx11/stencil-cuda.cu
+++ b/Cxx11/stencil-cuda.cu
@@ -122,6 +122,9 @@ int main(int argc, char* argv[])
           tile_size = std::atoi(argv[3]);
           if (tile_size <= 0) tile_size = n;
           if (tile_size > n) tile_size = n;
+          if (tile_size > 32) {
+              std::cout << "Warning: tile_size > 32 may lead to incorrect results (observed for CUDA 9.0 on GV100).\n";
+          }
       }
 
       // stencil pattern
diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.f90
index fd98f717e..96c4b1679 100644
--- a/FORTRAN/nstream-openmp-target.f90
+++ b/FORTRAN/nstream-openmp-target.f90
@@ -134,10 +134,10 @@ program main
     endif
   endif
 
-  write(*,'(a,i8)') 'Number of threads    = ',omp_get_max_threads()
-  write(*,'(a,i8)') 'Number of iterations = ', iterations
-  write(*,'(a,i8)') 'Matrix length        = ', length
-  write(*,'(a,i8)') 'Offset               = ', offset
+  write(*,'(a,i12)') 'Number of threads    = ', omp_get_max_threads()
+  write(*,'(a,i12)') 'Number of iterations = ', iterations
+  write(*,'(a,i12)') 'Matrix length        = ', length
+  write(*,'(a,i12)') 'Offset               = ', offset
 
   ! ********************************************************************
   ! ** Allocate space for the input and transpose matrix
@@ -189,10 +189,11 @@ program main
   enddo ! iterations
 
   t1 = omp_get_wtime()
-  nstream_time = t1 - t0
 
   !$omp end target data
 
+  nstream_time = t1 - t0
+
   ! ********************************************************************
   ! ** Analyze and output results.
   ! ********************************************************************
diff --git a/FORTRAN/nstream-ornlacc.f90 b/FORTRAN/nstream-ornlacc.f90
index e6d073947..033dee814 100644
--- a/FORTRAN/nstream-ornlacc.f90
+++ b/FORTRAN/nstream-ornlacc.f90
@@ -133,9 +133,9 @@ program main
     endif
   endif
 
-  write(*,'(a,i8)') 'Number of iterations = ', iterations
-  write(*,'(a,i8)') 'Matrix length        = ', length
-  write(*,'(a,i8)') 'Offset               = ', offset
+  write(*,'(a,i12)') 'Number of iterations = ', iterations
+  write(*,'(a,i12)') 'Vector length        = ', length
+  write(*,'(a,i12)') 'Offset               = ', offset
 
   ! ********************************************************************
   ! ** Allocate space for the input and transpose matrix
@@ -163,13 +163,6 @@ program main
 
   t0 = 0
 
-#ifdef _OPENMP
-!$omp parallel default(none)                           &
-!$omp&  shared(A,B,C,t0,t1)                            &
-!$omp&  firstprivate(length,iterations,offset,scalar)  &
-!$omp&  private(i,k)
-#endif
-
   !$acc parallel loop gang
   do i=1,length
     A(i) = 0
diff --git a/FORTRAN/nstream-pretty.f90 b/FORTRAN/nstream-pretty.f90
index be0fb217d..a15e365ec 100644
--- a/FORTRAN/nstream-pretty.f90
+++ b/FORTRAN/nstream-pretty.f90
@@ -133,9 +133,9 @@ program main
     endif
   endif
 
-  write(*,'(a,i8)') 'Number of iterations = ', iterations
-  write(*,'(a,i8)') 'Matrix length        = ', length
-  write(*,'(a,i8)') 'Offset               = ', offset
+  write(*,'(a,i12)') 'Number of iterations = ', iterations
+  write(*,'(a,i12)') 'Vector length        = ', length
+  write(*,'(a,i12)') 'Offset               = ', offset
 
   ! ********************************************************************
   ! ** Allocate space for the input and transpose matrix
diff --git a/FORTRAN/nstream-taskloop-openmp.f90 b/FORTRAN/nstream-taskloop-openmp.f90
index 553a220fc..636e45d73 100644
--- a/FORTRAN/nstream-taskloop-openmp.f90
+++ b/FORTRAN/nstream-taskloop-openmp.f90
@@ -124,10 +124,10 @@ program main
     endif
   endif
 
-  write(*,'(a,i8)') 'Number of threads    = ',omp_get_max_threads()
-  write(*,'(a,i8)') 'Number of iterations = ', iterations
-  write(*,'(a,i8)') 'Matrix length        = ', length
-  write(*,'(a,i8)') 'Offset               = ', offset
+  write(*,'(a,i12)') 'Number of threads    = ', omp_get_max_threads()
+  write(*,'(a,i12)') 'Number of iterations = ', iterations
+  write(*,'(a,i12)') 'Matrix length        = ', length
+  write(*,'(a,i12)') 'Offset               = ', offset
 
   ! ********************************************************************
   ! ** Allocate space for the input and transpose matrix
diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90
index 63986ab54..9d35024b5 100644
--- a/FORTRAN/nstream.f90
+++ b/FORTRAN/nstream.f90
@@ -141,11 +141,11 @@ program main
   endif
 
 #ifdef _OPENMP
-  write(*,'(a,i8)') 'Number of threads    = ',omp_get_max_threads()
+  write(*,'(a,i12)') 'Number of threads    = ', omp_get_max_threads()
 #endif
-  write(*,'(a,i8)') 'Number of iterations = ', iterations
-  write(*,'(a,i8)') 'Matrix length        = ', length
-  write(*,'(a,i8)') 'Offset               = ', offset
+  write(*,'(a,i12)') 'Number of iterations = ', iterations
+  write(*,'(a,i12)') 'Vector length        = ', length
+  write(*,'(a,i12)') 'Offset               = ', offset
 
   ! ********************************************************************
   ! ** Allocate space for the input and transpose matrix
@@ -212,10 +212,10 @@ program main
 #ifdef _OPENMP
     !$omp barrier
     !$omp master
-#endif
-    t0 = prk_get_wtime()
-#ifdef _OPENMP
+    t0 = omp_get_wtime()
     !$omp end master
+#else
+    t0 = prk_get_wtime()
 #endif
     endif
 
@@ -236,7 +236,11 @@ program main
 #endif
   enddo ! iterations
 
+#ifdef _OPENMP
+  t1 = omp_get_wtime()
+#else
   t1 = prk_get_wtime()
+#endif
 
 #ifdef _OPENMP
   !$omp end parallel
diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.f90
index 9e5ff8da7..32c24a4d4 100644
--- a/FORTRAN/p2p-innerloop-ornlacc.f90
+++ b/FORTRAN/p2p-innerloop-ornlacc.f90
@@ -107,6 +107,7 @@ program main
   if (n .gt. 16384) then
     write(*,'(a,i5)') 'WARNING: grid size exceeds 16384: ', n
     write(*,'(a)')    'PGI 17.10 + CUDA 9.0 generates illegal address'
+    write(*,'(a)')    'unless you compiled with -Mlarge_arrays.'
   endif
 
   if (iterations .lt. 1) then
diff --git a/common/make.defs.pgi b/common/make.defs.pgi
index ca8fb2d45..ddaf99a69 100644
--- a/common/make.defs.pgi
+++ b/common/make.defs.pgi
@@ -19,7 +19,9 @@ DEFAULT_OPT_FLAGS=-O2 -tp=haswell
 #
 OPENMPFLAG=-mp #-Minfo=mp,vect
 OFFLOADFLAG=-mp #-Minfo=mp,vect
-ORNLACCFLAG=-acc -ta=multicore #-Minfo=accel
+#ORNLACCFLAG=-acc -ta=multicore -Minfo=accel
+ORNLACCFLAG=-acc -ta=tesla:cc70 -Minfo=accel
+ORNLACCFLAG+=-Mlarge_arrays
 #
 # OpenCL flags
 #
@@ -27,7 +29,47 @@ ORNLACCFLAG=-acc -ta=multicore #-Minfo=accel
 #OPENCLFLAG=-framework OpenCL
 # Linux
 OPENCLDIR=/etc/alternatives/opencl-intel-tools
-OPENCLFLAG=-I$OPENCLDIR -L$OPENCLDIR/lib64 -lOpenCL
+OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+#
+# Parallel STL, Boost, etc.
+#
+BOOSTFLAG=-DUSE_BOOST -I.
+PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG}
+KOKKOSDIR=./kokkos
+KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
+RAJADIR=./raja
+RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+#
+# CUDA flags
+#
+# Linux w/ NVIDIA CUDA
+# NVCC never supports the latest GCC.
+# Use appropriate arch or code is compiled to ancient features.
+#NVCC=nvcc --compiler-bindir=<path to older GCC> --gpu-architecture=sm_61
+NVCC=nvcc --gpu-architecture=sm_61
+CUDAFLAGS=-g -O3 -std=c++11
+# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
+# heavy hammer:
+#CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
+# big hammers:
+#CUDAFLAGS+=-D_IMMINTRIN_H_INCLUDED
+#CUDAFLAGS+=-D_FMA4INTRIN_H_INCLUDED
+#CUDAFLAGS+=-D_XOPMMINTRIN_H_INCLUDED
+# many tiny hammers:
+CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512FINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VLINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512BWINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512DQINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VLBWINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VBMIVLINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VBMIINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512VLDQINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512CDINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512PFINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED
+CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
 #
 # MPI
 #

From 68eb648f36ff627d8c47d5775c94821d9ed47cb4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 12 Apr 2018 12:21:42 -0700
Subject: [PATCH 074/245] CUDA nstream type cleanup [ci skip] (#329)

---
 Cxx11/nstream-cuda.cu   |  7 +++----
 Cxx11/transpose-cuda.cu | 11 +++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/Cxx11/nstream-cuda.cu b/Cxx11/nstream-cuda.cu
index 646d637b0..4597021bb 100644
--- a/Cxx11/nstream-cuda.cu
+++ b/Cxx11/nstream-cuda.cu
@@ -66,7 +66,7 @@
 
 __global__ void nstream(const unsigned n, const prk_float scalar, prk_float * A, const prk_float * B, const prk_float * C)
 {
-    auto i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n) {
         A[i] += B[i] + scalar * C[i];
     }
@@ -156,8 +156,7 @@ int main(int argc, char * argv[])
   prk::CUDA::check( cudaMemcpy(d_B, &(h_B[0]), bytes, cudaMemcpyHostToDevice) );
   prk::CUDA::check( cudaMemcpy(d_C, &(h_C[0]), bytes, cudaMemcpyHostToDevice) );
 
-  double scalar(3);
-
+  prk_float scalar(3);
   {
     for (auto iter = 0; iter<=iterations; iter++) {
 
@@ -215,7 +214,7 @@ int main(int argc, char * argv[])
   } else {
       std::cout << "Solution validates" << std::endl;
       double avgtime = nstream_time/iterations;
-      double nbytes = 4.0 * length * sizeof(double);
+      double nbytes = 4.0 * length * sizeof(prk_float);
       std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
                 << " Avg time (s): " << avgtime << std::endl;
   }
diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu
index 1efdab462..289fbb1a3 100644
--- a/Cxx11/transpose-cuda.cu
+++ b/Cxx11/transpose-cuda.cu
@@ -149,9 +149,9 @@ int main(int argc, char * argv[])
   std::cout << "Number of iterations  = " << iterations << std::endl;
   std::cout << "Matrix order          = " << order << std::endl;
 #if TILED
-  std::cout << "Tile size            = " << tile_dim << std::endl;
+  std::cout << "Tile size             = " << tile_dim << std::endl;
 #else
-  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Tile size             = " << tile_size << std::endl;
 #endif
 
 #if TILED
@@ -224,9 +224,8 @@ int main(int argc, char * argv[])
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  // TODO: replace with std::generate, std::accumulate, or similar
-  const auto addit = (iterations+1.) * (iterations/2.);
-  auto abserr = 0.0;
+  const double addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
   for (auto j=0; j<order; j++) {
     for (auto i=0; i<order; i++) {
       const size_t ij = (size_t)i*(size_t)order+(size_t)j;
@@ -249,7 +248,7 @@ int main(int argc, char * argv[])
   if (abserr < epsilon) {
     std::cout << "Solution validates" << std::endl;
     auto avgtime = trans_time/iterations;
-    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    auto bytes = (size_t)order * (size_t)order * sizeof(prk_float);
     std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {

From dd0c94586445dcfeaaff43e3075bf4fa689dd2d1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 19 Apr 2018 11:25:06 -0700
Subject: [PATCH 075/245] DGEMM batched (#330)

* add support for batched BLAS in CBLAS implementation

user requests batchsize n:
 0 = no batching
-n = loops over legacy BLAS
 n = uses batched BLAS

currently this is only tested with MKL

* cosmetic changes

* add batched BLAS

* add async and proper error checking to batched CUBLAS

* add OpenMP to loop over GEMMs

* update LLVM example

* support non-MKL, better printout
---
 Cxx11/dgemm-cblas.cc  | 215 +++++++++++++++++++++++++++++++-----------
 Cxx11/dgemm-cublas.cu | 180 ++++++++++++++++++++++++++---------
 common/make.defs.llvm |  16 ++--
 3 files changed, 300 insertions(+), 111 deletions(-)

diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index fc5709812..cb0e44f51 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017, Intel Corporation
+/// Copyright (c) 2018, Intel Corporation
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@
 ///          is carried out, and, optionally, a tile size for matrix
 ///          blocking
 ///
-///          <progname> <# iterations> <matrix order> [<tile size>]
+///          <progname> <# iterations> <matrix order> [<batches>]
 ///
 ///          The output consists of diagnostics to make sure the
 ///          algorithm worked, and of timing statistics.
@@ -51,7 +51,8 @@
 ///          Other than OpenMP or standard C functions, the following
 ///          functions are used in this program:
 ///
-///          wtime()
+///          cblas_dgemm()
+///          cblas_dgemm_batch()
 ///
 /// HISTORY: Written by Rob Van der Wijngaart, February 2009.
 ///          Converted to C++11 by Jeff Hammond, December, 2017.
@@ -79,9 +80,9 @@ void prk_dgemm_loops(const int order,
                const std::vector<double> & B,
                      std::vector<double> & C)
 {
-    for (auto i=0; i<order; ++i) {
-      for (auto j=0; j<order; ++j) {
-        for (auto k=0; k<order; ++k) {
+    for (int i=0; i<order; ++i) {
+      for (int j=0; j<order; ++j) {
+        for (int k=0; k<order; ++k) {
             C[i*order+j] += A[i*order+k] * B[k*order+j];
         }
       }
@@ -97,24 +98,103 @@ void prk_dgemm(const int order,
     const cblas_int n = order;
     const double alpha = 1.0;
     const double beta  = 1.0;
+
     cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                 n, n, n, alpha, &(A[0]), n, &(B[0]), n, beta, &(C[0]), n);
 }
 
+void prk_dgemm(const int order, const int batches,
+               const std::vector<std::vector<double>> & A,
+               const std::vector<std::vector<double>> & B,
+                     std::vector<std::vector<double>> & C)
+{
+    const cblas_int n = order;
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+    for (int b=0; b<batches; ++b) {
+        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                    n, n, n, alpha, &(A[b][0]), n, &(B[b][0]), n, beta, &(C[b][0]), n);
+    }
+}
+
+void prk_dgemm(const int order, const int batches, const int nt,
+               const std::vector<std::vector<double>> & A,
+               const std::vector<std::vector<double>> & B,
+                     std::vector<std::vector<double>> & C)
+{
+    const cblas_int n = order;
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) num_threads(nt)
+#endif
+    for (int b=0; b<batches; ++b) {
+        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                    n, n, n, alpha, &(A[b][0]), n, &(B[b][0]), n, beta, &(C[b][0]), n);
+    }
+}
+
+void prk_dgemm(const int order, const int batches,
+               double** & A,
+               double** & B,
+               double** & C)
+{
+    const cblas_int n = order;
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+    const cblas_int group_count = 1;
+    const cblas_int group_size[group_count] = { batches };
+
+    const CBLAS_TRANSPOSE transa_array[group_count] = { CblasNoTrans };
+    const CBLAS_TRANSPOSE transb_array[group_count] = { CblasNoTrans };
+
+    const cblas_int n_array[group_count] = { n };
+
+    const double alpha_array[group_count] = { alpha };
+    const double beta_array[group_count]  = { beta };
+
+#ifdef MKL
+    cblas_dgemm_batch(CblasRowMajor, transa_array, transb_array,
+                      n_array, n_array, n_array,
+                      alpha_array,
+                      (const double**) A, n_array,
+                      (const double**) B, n_array,
+                      beta_array,
+                      C, n_array,
+                      group_count, group_size);
+#else // e.g. Accelerate does not have batched BLAS
+    for (int b=0; b<batches; ++b) {
+        cblas_dgemm(CblasRowMajor,
+                    transa_array[0], transb_array[0],
+                    n_array[0], n_array[0], n_array[0],
+                    alpha_array[0],
+                    A[b], n_array[0],
+                    B[b], n_array[0],
+                    beta_array[0],
+                    C[b], n_array[0]);
+    }
+#endif
+}
+
 int main(int argc, char * argv[])
 {
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 CBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
   //////////////////////////////////////////////////////////////////////
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11 CBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
-
   int iterations;
   int order;
+  int batches = 0;
+  int batch_threads = 1;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order>";
+        throw "Usage: <# iterations> <matrix order> [<batches> <batch threads>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -128,6 +208,18 @@ int main(int argc, char * argv[])
       } else if (order > std::floor(std::sqrt(INT_MAX))) {
         throw "ERROR: matrix dimension too large - overflow risk";
       }
+
+      if (argc>3) {
+        batches = std::atoi(argv[3]);
+      }
+
+      if (argc>4) {
+        batch_threads = std::atoi(argv[4]);
+      } else {
+#ifdef _OPENMP
+        batch_threads = omp_get_max_threads();
+#endif
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -136,6 +228,22 @@ int main(int argc, char * argv[])
 
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Matrix order         = " << order << std::endl;
+  if (batches == 0) {
+      std::cout << "No batching" << std::endl;
+  } else if (batches > 0) {
+#ifdef MKL
+      std::cout << "Batch size           = " <<  batches << " (batched BLAS)" << std::endl;
+#else
+      std::cout << "Batch size           = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl;
+#endif
+  } else if (batches < 0) {
+      if (batch_threads > 1) {
+          std::cout << "Batch size           = " << std::abs(batches) << " (loop over legacy BLAS with "
+                    << batch_threads << " threads)" << std::endl;
+      } else {
+          std::cout << "Batch size           = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl;
+      }
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Allocate space for matrices
@@ -143,34 +251,44 @@ int main(int argc, char * argv[])
 
   double dgemm_time(0);
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order);
-  std::vector<double> C(order*order,0.0);
-#ifdef PRK_DEBUG
-  const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-  std::default_random_engine generator(seed);
-  std::uniform_real_distribution<double> uniform01(0.0, 1.0);
-  for (auto i=0; i<order; ++i) {
-    for (auto j=0; j<order; ++j) {
-       A[i*order+j] = uniform01(generator);
-       B[i*order+j] = uniform01(generator);
+  const int matrices = (batches==0 ? 1 : abs(batches));
+
+  std::vector<double> const M(order*order,0);
+  std::vector<std::vector<double>> A(matrices,M);
+  std::vector<std::vector<double>> B(matrices,M);
+  std::vector<std::vector<double>> C(matrices,M);
+  for (int b=0; b<matrices; ++b) {
+    for (int i=0; i<order; ++i) {
+      for (int j=0; j<order; ++j) {
+         A[b][i*order+j] = i;
+         B[b][i*order+j] = i;
+         C[b][i*order+j] = 0;
+      }
     }
   }
-#else
-  for (auto i=0; i<order; ++i) {
-    for (auto j=0; j<order; ++j) {
-       A[i*order+j] = i;
-       B[i*order+j] = i;
-    }
+
+  double ** pA = new double*[matrices];
+  double ** pB = new double*[matrices];
+  double ** pC = new double*[matrices];
+
+  for (int b=0; b<matrices; ++b) {
+     pA[b] = A[b].data();
+     pB[b] = B[b].data();
+     pC[b] = C[b].data();
   }
-#endif
 
   {
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) dgemm_time = prk::wtime();
 
-      prk_dgemm(order, A, B, C);
+      if (batches == 0) {
+          prk_dgemm(order, A[0], B[0], C[0]);
+      } else if (batches < 0) {
+          prk_dgemm(order, matrices, batch_threads, A, B, C);
+      } else if (batches > 0) {
+          prk_dgemm(order, matrices, pA, pB, pC);
+      }
     }
     dgemm_time = prk::wtime() - dgemm_time;
   }
@@ -179,30 +297,15 @@ int main(int argc, char * argv[])
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  const auto epsilon = 1.0e-8;
-  const auto forder = static_cast<double>(order);
-#ifdef PRK_DEBUG
-  std::vector<double> D(order*order,0.0);;
-  for (auto iter = 0; iter<=iterations; iter++) {
-    prk_dgemm_loops(order, A, B, D);
-  }
+  const double epsilon = 1.0e-8;
+  const double forder = static_cast<double>(order);
+  const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
   double residuum(0);
-  for (auto i=0; i<order; ++i) {
-    for (auto j=0; j<order; ++j) {
-        const auto diff = std::abs(C[i*order+j] - D[i*order+j]);
-        residuum += diff;
-        if (diff > epsilon) {
-            std::cout << i << "," << j << " = " << C[i*order+j] << ", " << D[i*order+j] << "\n";
-        }
-    }
+  for (int b=0; b<matrices; ++b) {
+      const auto checksum = prk_reduce(C[b].begin(), C[b].end(), 0.0);
+      residuum += std::abs(checksum-reference)/reference;
   }
-  const auto reference = prk_reduce(D.begin(), D.end(), 0.0);
-  const auto checksum  = prk_reduce(C.begin(), C.end(), 0.0);
-#else
-  const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
-  const auto checksum = prk_reduce(C.begin(), C.end(), 0.0);
-  const auto residuum = std::abs(checksum-reference)/reference;
-#endif
+  residuum/=matrices;
 
   if (residuum < epsilon) {
 #if VERBOSE
@@ -210,17 +313,17 @@ int main(int argc, char * argv[])
               << "Actual checksum = " << checksum << std::endl;
 #endif
     std::cout << "Solution validates" << std::endl;
-    auto avgtime = dgemm_time/iterations;
+    auto avgtime = dgemm_time/iterations/matrices;
     auto nflops = 2.0 * std::pow(forder,3);
     std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "Reference checksum = " << reference << "\n"
-              << "Actual checksum = " << checksum << std::endl;
+              << "Residuum           = " << residuum << std::endl;
 #if VERBOSE
     std::cout << "i, j, A, B, C, D" << std::endl;
-    for (auto i=0; i<order; ++i)
-      for (auto j=0; j<order; ++j)
+    for (int i=0; i<order; ++i)
+      for (int j=0; j<order; ++j)
         std::cout << i << "," << j << " = " << A[i*order+j] << ", " << B[i*order+j] << ", " << C[i*order+j] << ", " << D[i*order+j] << "\n";
     std::cout << std::endl;
 #endif
diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu
index 08bcf80c7..bced87c6c 100644
--- a/Cxx11/dgemm-cublas.cu
+++ b/Cxx11/dgemm-cublas.cu
@@ -41,7 +41,7 @@
 ///          is carried out, and, optionally, a tile size for matrix
 ///          blocking
 ///
-///          <progname> <# iterations> <matrix order> [<tile size>]
+///          <progname> <# iterations> <matrix order> [<batches>]
 ///
 ///          The output consists of diagnostics to make sure the
 ///          algorithm worked, and of timing statistics.
@@ -51,7 +51,8 @@
 ///          Other than OpenMP or standard C functions, the following
 ///          functions are used in this program:
 ///
-///          wtime()
+///          cblasDgemm()
+///          cublasDgemmStridedBatched()
 ///
 /// HISTORY: Written by Rob Van der Wijngaart, February 2009.
 ///          Converted to C++11 by Jeff Hammond, December, 2017.
@@ -61,28 +62,91 @@
 #include "prk_util.h"
 #include "prk_cuda.h"
 
-__global__ void init(unsigned order, double * A, double * B, double * C)
+__global__ void init(int order, const int matrices, double * A, double * B, double * C)
 {
-    auto i = blockIdx.x * blockDim.x + threadIdx.x;
-    auto j = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if ((i<order) && (j<order)) {
-      A[i*order+j] = i;
-      B[i*order+j] = i;
-      C[i*order+j] = 0;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    for (int b=0; b<matrices; ++b) {
+      if ((i<order) && (j<order)) {
+        A[b*order*order+i*order+j] = i;
+        B[b*order*order+i*order+j] = i;
+        C[b*order*order+i*order+j] = 0;
+      }
     }
 }
 
-__global__ void init(unsigned order, double * C)
+__global__ void init(int order, const int matrices, double * C)
 {
-    auto i = blockIdx.x * blockDim.x + threadIdx.x;
-    auto j = blockIdx.y * blockDim.y + threadIdx.y;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
 
-    if ((i<order) && (j<order)) {
-      C[i*order+j] = 0;
+    for (int b=0; b<matrices; ++b) {
+      if ((i<order) && (j<order)) {
+        C[b*order*order+i*order+j] = 0;
+      }
     }
 }
 
+void prk_dgemm(const cublasHandle_t & h,
+               const int order,
+               const int batches,
+               double * A,
+               double * B,
+               double * C)
+{
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+    for (int b=0; b<batches; ++b) {
+        double * pA = &(A[b*order*order]);
+        double * pB = &(B[b*order*order]);
+        double * pC = &(C[b*order*order]);
+        prk::CUDA::check( cublasDgemm(h,
+                                      CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
+                                      order, order, order,      // m, n, k
+                                      &alpha,                   // alpha
+                                      pA, order,                // A, lda
+                                      pB, order,                // B, ldb
+                                      &beta,                    // beta
+                                      pC, order) );             // C, ldc
+    }
+    prk::CUDA::check( cudaDeviceSynchronize() );
+}
+
+void prk_bgemm(const cublasHandle_t & h,
+               const int order,
+               const int batches,
+               double * A,
+               double * B,
+               double * C)
+{
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+    prk::CUDA::check( cublasDgemmStridedBatched(h,
+                                                CUBLAS_OP_N, CUBLAS_OP_N,
+                                                order, order, order,
+                                                &alpha,
+                                                (const double *)A, order, order*order,
+                                                (const double *)B, order, order*order,
+                                                &beta,
+                                                C, order, order*order,
+                                                batches) );
+    prk::CUDA::check( cudaDeviceSynchronize() );
+
+    //  cublasStatus_t cublasDgemmBatched(cublasHandle_t handle,
+    //                                    cublasOperation_t transa,
+    //                                    cublasOperation_t transb,
+    //                                    int m, int n, int k,
+    //                                    const double          *alpha,
+    //                                    const double          *Aarray[], int lda,
+    //                                    const double          *Barray[], int ldb,
+    //                                    const double          *beta,
+    //                                    double          *Carray[], int ldc,
+    //                                    int batchCount)
+}
+
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
@@ -97,10 +161,11 @@ int main(int argc, char * argv[])
 
   int iterations;
   int order;
+  int batches = 0;
   int input_copy = 0;
   try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <matrix order> <copy input every iteration [0/1]>";
+      if (argc < 2) {
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -115,7 +180,11 @@ int main(int argc, char * argv[])
         throw "ERROR: matrix dimension too large - overflow risk";
       }
 
-      if (argc > 3) {
+      if (argc>3) {
+        batches = std::atoi(argv[3]);
+      }
+
+      if (argc > 4) {
         input_copy = std::atoi(argv[3]);
       }
   }
@@ -126,12 +195,19 @@ int main(int argc, char * argv[])
 
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Matrix order         = " << order << std::endl;
+  if (batches == 0) {
+      std::cout << "No batching" << std::endl;
+  } else if (batches < 0) {
+      std::cout << "Batch size           = " << -batches << " (loop over legacy BLAS)" << std::endl;
+  } else if (batches > 0) {
+      std::cout << "Batch size           = " <<  batches << " (batched BLAS)" << std::endl;
+  }
   std::cout << "Input copy           = " << (input_copy ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::CUDA::check( cublasCreate(&h) );
 
-  int tile_size = 32;
+  const int tile_size = 32;
   dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
   dim3 dimBlock(tile_size, tile_size, 1);
 
@@ -143,6 +219,7 @@ int main(int argc, char * argv[])
 
   double dgemm_time(0);
 
+  const int matrices = (batches==0 ? 1 : abs(batches));
   const size_t nelems = (size_t)order * (size_t)order;
   const size_t bytes = nelems * sizeof(double);
 
@@ -152,15 +229,15 @@ int main(int argc, char * argv[])
   double * h_c;
   prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) );
   prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) );
-  prk::CUDA::check( cudaMallocHost((void**)&h_c, bytes) );
+  prk::CUDA::check( cudaMallocHost((void**)&h_c, matrices*bytes) );
 
   // device buffers
   double * d_a;
   double * d_b;
   double * d_c;
-  prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) );
-  prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) );
-  prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) );
+  prk::CUDA::check( cudaMalloc((void**)&d_a, matrices*bytes) );
+  prk::CUDA::check( cudaMalloc((void**)&d_b, matrices*bytes) );
+  prk::CUDA::check( cudaMalloc((void**)&d_c, matrices*bytes) );
 
   if (input_copy) {
 
@@ -171,14 +248,17 @@ int main(int argc, char * argv[])
       }
     }
 
-    prk::CUDA::check( cudaMemcpy(d_a, &(h_a[0]), bytes, cudaMemcpyHostToDevice) );
-    prk::CUDA::check( cudaMemcpy(d_b, &(h_b[0]), bytes, cudaMemcpyHostToDevice) );
+    for (int b=0; b<matrices; ++b) {
+      prk::CUDA::check( cudaMemcpyAsync(&(d_a[b*order*order]), h_a, bytes, cudaMemcpyHostToDevice) );
+      prk::CUDA::check( cudaMemcpyAsync(&(d_b[b*order*order]), h_b, bytes, cudaMemcpyHostToDevice) );
+    }
+    prk::CUDA::check( cudaDeviceSynchronize() );
 
-    init<<<dimGrid, dimBlock>>>(order, d_c);
+    init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
 
   } else {
 
-    init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+    init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
 
   }
 
@@ -188,26 +268,26 @@ int main(int argc, char * argv[])
       if (iter==1) dgemm_time = prk::wtime();
 
       if (input_copy) {
-        prk::CUDA::check( cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice) );
-        prk::CUDA::check( cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice) );
+        for (int b=0; b<matrices; ++b) {
+          prk::CUDA::check( cudaMemcpyAsync(&(d_a[b*order*order]), h_a, bytes, cudaMemcpyHostToDevice) );
+          prk::CUDA::check( cudaMemcpyAsync(&(d_b[b*order*order]), h_b, bytes, cudaMemcpyHostToDevice) );
+        }
+        prk::CUDA::check( cudaDeviceSynchronize() );
       }
 
-      double one(1);
-      prk::CUDA::check( cublasDgemm(h,
-                                    CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
-                                    order, order, order,      // m, n, k
-                                    &one,                     // alpha
-                                    d_a, order,               // A, lda
-                                    d_b, order,               // B, ldb
-                                    &one,                     // beta
-                                    d_c, order) );            // C, ldc
-      prk::CUDA::check( cudaDeviceSynchronize() );
+      if (batches == 0) {
+        prk_dgemm(h, order, matrices, d_a, d_b, d_c);
+      } else if (batches < 0) {
+        prk_dgemm(h, order, matrices, d_a, d_b, d_c);
+      } else if (batches > 0) {
+        prk_bgemm(h, order, matrices, d_a, d_b, d_c);
+      }
     }
     dgemm_time = prk::wtime() - dgemm_time;
   }
 
   // copy output back to host
-  prk::CUDA::check( cudaMemcpy(&(h_c[0]), d_c, bytes, cudaMemcpyDeviceToHost) );
+  prk::CUDA::check( cudaMemcpyAsync(&(h_c[0]), d_c, matrices*bytes, cudaMemcpyDeviceToHost) );
 
   prk::CUDA::check( cudaFree(d_c) );
   prk::CUDA::check( cudaFree(d_b) );
@@ -218,15 +298,21 @@ int main(int argc, char * argv[])
 
   prk::CUDA::check( cublasDestroy(h) );
 
+  prk::CUDA::check( cudaDeviceSynchronize() );
+
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  const auto epsilon = 1.0e-8;
-  const auto forder = static_cast<double>(order);
-  const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
-  const auto checksum = prk_reduce( &(h_c[0]), &(h_c[nelems]), 0.0);
-  const auto residuum = std::abs(checksum-reference)/reference;
+  const double epsilon = 1.0e-8;
+  const double forder = static_cast<double>(order);
+  const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
+  double residuum(0);
+  for (int b=0; b<matrices; ++b) {
+      const auto checksum = prk_reduce( &(h_c[b*order*order+0]), &(h_c[b*order*order+nelems]), 0.0);
+      residuum += std::abs(checksum-reference)/reference;
+  }
+  residuum/=matrices;
 
   if (residuum < epsilon) {
 #if VERBOSE
@@ -234,13 +320,13 @@ int main(int argc, char * argv[])
               << "Actual checksum = " << checksum << std::endl;
 #endif
     std::cout << "Solution validates" << std::endl;
-    auto avgtime = dgemm_time/iterations;
+    auto avgtime = dgemm_time/iterations/matrices;
     auto nflops = 2.0 * std::pow(forder,3);
     std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "Reference checksum = " << reference << "\n"
-              << "Actual checksum = " << checksum << std::endl;
+              << "Residuum           = " << residuum << std::endl;
     return 1;
   }
 
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 2e1ab47de..0ae50e78b 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -4,7 +4,7 @@
 #
 # Base compilers and language options
 #
-LLVM_ROOT=/usr/local/Cellar/llvm/5.0.0
+LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0
 LLVM_PATH=${LLVM_ROOT}/bin/
 #LLVM_PATH=/opt/llvm/HEAD/bin/
 # C99 is required in some implementations.
@@ -74,21 +74,21 @@ SYCLFLAG+=-std=c++14
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-#SYCLDIR=./triSYCL
-#SYCLCXX=${CXX} ${OPENMPFLAG}
-#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} ${OPENMPFLAG}
+SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
-SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
-SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 OCCADIR=${HOME}/prk-repo/Cxx11/occa
 #
 # TBB
 #
-TBBDIR=/usr/local/Cellar/tbb/2018_U1
+TBBDIR=/usr/local/Cellar/tbb/2018_U3
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.

From 7b24f6905f63d53b0f15eb92221b0c4bf88bfdaf Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 23 Apr 2018 11:15:28 +0300
Subject: [PATCH 076/245] trivial Kokkos cleanup (#331)

* disable verbose vectorization info

* kokkos finalize and whitespace
---
 Cxx11/nstream-kokkos.cc | 4 +++-
 Cxx11/stencil-kokkos.cc | 2 +-
 common/make.defs.gcc    | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc
index 4cb76f5fb..abf98772f 100644
--- a/Cxx11/nstream-kokkos.cc
+++ b/Cxx11/nstream-kokkos.cc
@@ -141,7 +141,7 @@ int main(int argc, char * argv[])
 
       if (iter==1) nstream_time = prk::wtime();
 
-      Kokkos::parallel_for ( length, KOKKOS_LAMBDA(const int i) {
+      Kokkos::parallel_for( length, KOKKOS_LAMBDA(const int i) {
           A[i] += B[i] + scalar * C[i];
       });
     }
@@ -181,6 +181,8 @@ int main(int argc, char * argv[])
                 << " Avg time (s): " << avgtime << std::endl;
   }
 
+  Kokkos::finalize();
+
   return 0;
 }
 
diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc
index 4813294e5..be2514743 100644
--- a/Cxx11/stencil-kokkos.cc
+++ b/Cxx11/stencil-kokkos.cc
@@ -243,7 +243,7 @@ int main(int argc, char* argv[])
               << " Avg time (s): " << avgtime << std::endl;
   }
 
-  Kokkos::finalize ();
+  Kokkos::finalize();
 
   return 0;
 }
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 36732883f..3b3e6413a 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -25,7 +25,7 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
 # DEFAULT_OPT_FLAGS=-g -O3 -march=knl
 # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
 #
-DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
+#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
 DEFAULT_OPT_FLAGS+=-Wall
 #
 # OpenMP flags

From 09997a2851170003d266c87778837a16390e88b6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 23 Apr 2018 11:16:14 +0300
Subject: [PATCH 077/245] tiled rangefor transpose (#332)

* add tiling to rangefor transpose
* hoist irange constructor
---
 Cxx11/transpose-vector-rangefor.cc | 50 ++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc
index ee0097026..e02047a6d 100644
--- a/Cxx11/transpose-vector-rangefor.cc
+++ b/Cxx11/transpose-vector-rangefor.cc
@@ -39,7 +39,10 @@
 /// USAGE:   Program input is the matrix order and the number of times to
 ///          repeat the operation:
 ///
-///          transpose <matrix_size> <# iterations>
+///          transpose <matrix_size> <# iterations> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
 ///
 ///          The output consists of diagnostics to make sure the
 ///          transpose worked and timing statistics.
@@ -57,61 +60,72 @@ int main(int argc, char * argv[])
   std::cout << "C++11/range-for Matrix transpose: B = A^T" << std::endl;
 
   //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
+  // Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
   int iterations;
   int order;
+  int tile_size;
   try {
       if (argc < 3) {
-        throw "Usage: <# iterations> <matrix order>";
+        throw "Usage: <# iterations> <matrix order> [tile size]";
       }
 
-      // number of times to do the transpose
       iterations  = std::atoi(argv[1]);
       if (iterations < 1) {
         throw "ERROR: iterations must be >= 1";
       }
 
-      // order of a the matrix
       order = std::atoi(argv[2]);
       if (order <= 0) {
         throw "ERROR: Matrix Order must be greater than 0";
       } else if (order > std::floor(std::sqrt(INT_MAX))) {
         throw "ERROR: matrix dimension too large - overflow risk";
       }
+
+      // default tile size for tiling of local transpose
+      tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+      // a negative tile size means no tiling of the local transpose
+      if (tile_size <= 0) tile_size = order;
   }
   catch (const char * e) {
     std::cout << e << std::endl;
     return 1;
   }
 
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
 
   //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
+  // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
+  auto trans_time = 0.0;
+
   std::vector<double> A(order*order);
   std::vector<double> B(order*order,0.0);
+
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
-  auto irange = boost::irange(0,order);
-  auto jrange = boost::irange(0,order);
-
-  auto trans_time = 0.0;
+  auto itrange = boost::irange(0,order,tile_size);
+  auto jtrange = boost::irange(0,order,tile_size);
 
   for (auto iter = 0; iter<=iterations; iter++) {
 
     if (iter==1) trans_time = prk::wtime();
 
-    // transpose
-    for (auto i : irange) {
-      for (auto j : jrange) {
-        B[i*order+j] += A[j*order+i];
-        A[j*order+i] += 1.0;
+    for (auto it : itrange) {
+      auto irange = boost::irange(it,std::min(order,it+tile_size));
+      for (auto jt : jtrange) {
+        auto jrange = boost::irange(jt,std::min(order,jt+tile_size));
+        for (auto i : irange) {
+          for (auto j : jrange) {
+            B[i*order+j] += A[j*order+i];
+            A[j*order+i] += 1.0;
+          }
+        }
       }
     }
   }
@@ -124,6 +138,8 @@ int main(int argc, char * argv[])
   // TODO: replace with std::generate, std::accumulate, or similar
   const auto addit = (iterations+1.) * (iterations/2.);
   auto abserr = 0.0;
+  auto irange = boost::irange(0,order);
+  auto jrange = boost::irange(0,order);
   for (auto i : irange) {
     for (auto j : jrange) {
       const int ij = i*order+j;

From a0dd2147d9b99e27b15a909c72ad12a2d56e279d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 25 Apr 2018 12:59:46 +0300
Subject: [PATCH 078/245] Kokkos mdrange (#334)

* kokkos mdrange working in transpose and stencil
* Kokkos MDRange improvements
- remove Boost dependency
- use auto less, initialize double nicer
- whitespace and scoping for init/finalize destructor issue
---
 Cxx11/Makefile                |   6 +-
 Cxx11/generate-cxx-stencil.py |   8 +-
 Cxx11/nstream-kokkos.cc       | 182 ++++++------
 Cxx11/stencil-kokkos.cc       | 282 +++++++++---------
 Cxx11/stencil_kokkos.hpp      | 544 ++++++++++++++++------------------
 Cxx11/stencil_openmp.hpp      | 468 ++++++++++++++---------------
 Cxx11/stencil_pgnu.hpp        | 468 ++++++++++++++---------------
 Cxx11/stencil_pstl.hpp        | 468 ++++++++++++++---------------
 Cxx11/stencil_raja.hpp        | 468 ++++++++++++++---------------
 Cxx11/stencil_rangefor.hpp    | 468 ++++++++++++++---------------
 Cxx11/stencil_seq.hpp         | 468 ++++++++++++++---------------
 Cxx11/stencil_stl.hpp         | 468 ++++++++++++++---------------
 Cxx11/stencil_target.hpp      | 468 ++++++++++++++---------------
 Cxx11/stencil_taskloop.hpp    | 468 ++++++++++++++---------------
 Cxx11/stencil_tbb.hpp         | 468 ++++++++++++++---------------
 Cxx11/transpose-kokkos.cc     | 220 +++++++-------
 16 files changed, 2944 insertions(+), 2978 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 3af1ad2e7..5eb4b1526 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -43,7 +43,7 @@ BOOSTFLAGS = $(BOOSTFLAG)
 STLFLAGS = $(STLFLAG) $(BOOSTFLAGS)
 PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS)
 RAJAFLAGS = $(RAJAFLAG)
-KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS)
+KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG)
 ORNLACCFLAGS = $(ORNLACCFLAG)
 
 ifdef OCCADIR
@@ -63,7 +63,7 @@ else
   EXTRA += target
 endif
 
-all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA)
+all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA)
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
@@ -162,7 +162,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -o $@
 
 %-boost-compute: %-boost-compute.cc prk_util.h
-	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) $(OPENCLFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -DUSE_BOOST_COMPUTE $(OPENCLFLAGS) -o $@
 
 %-raja: %-raja.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@
diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index 134cd0e89..39e66459a 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -66,9 +66,8 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('      for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {\n')
     elif (model=='kokkos'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n')
-        src.write('    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>('+str(radius)+',n-'+str(radius)+'), KOKKOS_LAMBDA(const int i) {\n')
-        src.write('      PRAGMA_SIMD\n')
-        src.write('      for (auto j='+str(radius)+'; j<n-'+str(radius)+'; ++j) {\n')
+        src.write('    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({'+str(radius)+','+str(radius)+'},{n-'+str(radius)+',n-'+str(radius)+'},{t,t});\n')
+        src.write('    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {\n')
     elif (model=='cuda'):
         src.write('__global__ void '+pattern+str(radius)+'(const int n, const prk_float * in, prk_float * out) {\n')
         src.write('    const int i = blockIdx.x * blockDim.x + threadIdx.x;\n')
@@ -82,7 +81,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('          PRAGMA_SIMD\n')
         src.write('          for (auto j=jt; j<std::min(n-'+str(radius)+',jt+t); ++j) {\n')
     if (model=='kokkos'):
-        src.write('            out(i,j) += ')
+        src.write('              out(i,j) += ')
     else:
         src.write('            out[i*n+j] += ')
     k = 0
@@ -106,7 +105,6 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('       });\n')
         src.write('     });\n')
     elif (model=='kokkos'):
-        src.write('       }\n')
         src.write('     });\n')
     elif (model=='tbb'):
         src.write('      }\n')
diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc
index abf98772f..7e468abcf 100644
--- a/Cxx11/nstream-kokkos.cc
+++ b/Cxx11/nstream-kokkos.cc
@@ -75,112 +75,114 @@ int main(int argc, char * argv[])
   std::cout << "C++11/Kokkos STREAM triad: A = B + scalar * C" << std::endl;
 
   Kokkos::initialize(argc, argv);
+  {
 
-  typedef Kokkos::PRK_KOKKOS_BACKEND Space;
-  //typedef Kokkos::TeamPolicy<Space>               team_policy;
-  //typedef Kokkos::TeamPolicy<Space>::member_type  member_type;
-
-  typedef Kokkos::View<double*, Space> vector;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations, offset;
-  size_t length;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <vector length>";
-      }
-
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      length = std::atol(argv[2]);
-      if (length <= 0) {
-        throw "ERROR: vector length must be positive";
-      }
-
-      offset = (argc>3) ? std::atoi(argv[3]) : 0;
-      if (length <= 0) {
-        throw "ERROR: offset must be nonnegative";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
+    typedef Kokkos::PRK_KOKKOS_BACKEND Space;
+    //typedef Kokkos::TeamPolicy<Space>               team_policy;
+    //typedef Kokkos::TeamPolicy<Space>::member_type  member_type;
+
+    typedef Kokkos::View<double*, Space> vector;
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations, offset;
+    size_t length;
+    try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <vector length>";
+        }
+
+        iterations  = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        length = std::atol(argv[2]);
+        if (length <= 0) {
+          throw "ERROR: vector length must be positive";
+        }
+
+        offset = (argc>3) ? std::atoi(argv[3]) : 0;
+        if (length <= 0) {
+          throw "ERROR: offset must be nonnegative";
+        }
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
 
-  std::cout << "Number of iterations = " << iterations << std::endl;
-  std::cout << "Vector length        = " << length << std::endl;
-  std::cout << "Offset               = " << offset << std::endl;
-  std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+    std::cout << "Number of iterations = " << iterations << std::endl;
+    std::cout << "Vector length        = " << length << std::endl;
+    std::cout << "Offset               = " << offset << std::endl;
+    std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
 
-  //////////////////////////////////////////////////////////////////////
-  // Allocate space and perform the computation
-  //////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space and perform the computation
+    //////////////////////////////////////////////////////////////////////
 
-  auto nstream_time = 0.0;
+    double nstream_time(0);
 
-  vector A("A", length);
-  vector B("B", length);
-  vector C("C", length);
+    vector A("A", length);
+    vector B("B", length);
+    vector C("C", length);
 
-  double scalar(3);
+    const double scalar(3);
 
-  {
-    Kokkos::parallel_for ( length, KOKKOS_LAMBDA(const int i) {
-        A[i] = 0.0;
-        B[i] = 2.0;
-        C[i] = 2.0;
-    });
+    {
+      Kokkos::parallel_for(length, KOKKOS_LAMBDA(size_t const i) {
+          A[i] = 0.0;
+          B[i] = 2.0;
+          C[i] = 2.0;
+      });
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+      for (int iter = 0; iter<=iterations; ++iter) {
 
-      if (iter==1) nstream_time = prk::wtime();
+        if (iter==1) nstream_time = prk::wtime();
 
-      Kokkos::parallel_for( length, KOKKOS_LAMBDA(const int i) {
-          A[i] += B[i] + scalar * C[i];
-      });
+        Kokkos::parallel_for(length, KOKKOS_LAMBDA(size_t const i) {
+            A[i] += B[i] + scalar * C[i];
+        });
+      }
+      nstream_time = prk::wtime() - nstream_time;
     }
-    nstream_time = prk::wtime() - nstream_time;
-  }
 
-  //////////////////////////////////////////////////////////////////////
-  /// Analyze and output results
-  //////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////
+    /// Analyze and output results
+    //////////////////////////////////////////////////////////////////////
 
-  double ar(0);
-  double br(2);
-  double cr(2);
-  for (auto i=0; i<=iterations; i++) {
-      ar += br + scalar * cr;
-  }
-
-  ar *= length;
+    double ar(0);
+    double br(2);
+    double cr(2);
+    for (int i=0; i<=iterations; i++) {
+        ar += br + scalar * cr;
+    }
 
-  double asum(0);
-  Kokkos::parallel_reduce( length, KOKKOS_LAMBDA(const int i, double & inner) {
-    inner += std::fabs(A(i));
-  }, asum);
+    ar *= length;
+
+    double asum(0);
+    Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) {
+      inner += std::fabs(A(i));
+    }, asum);
+
+    double epsilon(1.e-8);
+    if (std::fabs(ar-asum)/asum > epsilon) {
+        std::cout << "Failed Validation on output array\n"
+                  << "       Expected checksum: " << ar << "\n"
+                  << "       Observed checksum: " << asum << std::endl;
+        std::cout << "ERROR: solution did not validate" << std::endl;
+        return 1;
+    } else {
+        std::cout << "Solution validates" << std::endl;
+        double avgtime = nstream_time/iterations;
+        double nbytes = 4.0 * length * sizeof(double);
+        std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                  << " Avg time (s): " << avgtime << std::endl;
+    }
 
-  double epsilon(1.e-8);
-  if (std::fabs(ar-asum)/asum > epsilon) {
-      std::cout << "Failed Validation on output array\n"
-                << "       Expected checksum: " << ar << "\n"
-                << "       Observed checksum: " << asum << std::endl;
-      std::cout << "ERROR: solution did not validate" << std::endl;
-      return 1;
-  } else {
-      std::cout << "Solution validates" << std::endl;
-      double avgtime = nstream_time/iterations;
-      double nbytes = 4.0 * length * sizeof(double);
-      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
-                << " Avg time (s): " << avgtime << std::endl;
   }
-
   Kokkos::finalize();
 
   return 0;
diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc
index be2514743..d2eb5db2a 100644
--- a/Cxx11/stencil-kokkos.cc
+++ b/Cxx11/stencil-kokkos.cc
@@ -72,8 +72,6 @@ void nothing(const int n, const int t, matrix & in, matrix & out)
 {
     std::cout << "You are trying to use a stencil that does not exist." << std::endl;
     std::cout << "Please generate the new stencil using the code generator." << std::endl;
-    // n will never be zero - this is to silence compiler warnings.
-    if (n==0) std::cout << in.size() << out.size() << std::endl;
     std::abort();
 }
 
@@ -83,166 +81,158 @@ int main(int argc, char* argv[])
   std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl;
 
   Kokkos::initialize (argc, argv);
+  {
+    //////////////////////////////////////////////////////////////////////
+    // Process and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations, n, radius, tile_size;
+    bool star = true;
+    try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
+        }
+
+        // number of times to run the algorithm
+        iterations  = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        // linear grid dimension
+        n  = std::atoi(argv[2]);
+        if (n < 1) {
+          throw "ERROR: grid dimension must be positive";
+        } else if (n > std::floor(std::sqrt(INT_MAX))) {
+          throw "ERROR: grid dimension too large - overflow risk";
+        }
+
+        // default tile size for tiling of local transpose
+        tile_size = 32;
+        if (argc > 3) {
+            tile_size = std::atoi(argv[3]);
+            if (tile_size <= 0) tile_size = n;
+            if (tile_size > n) tile_size = n;
+        }
+
+        // stencil pattern
+        if (argc > 4) {
+            auto stencil = std::string(argv[4]);
+            auto grid = std::string("grid");
+            star = (stencil == grid) ? false : true;
+        }
+
+        // stencil radius
+        radius = 2;
+        if (argc > 5) {
+            radius = std::atoi(argv[5]);
+        }
+
+        if ( (radius < 1) || (2*radius+1 > n) ) {
+          throw "ERROR: Stencil radius negative or too large";
+        }
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
 
-  //////////////////////////////////////////////////////////////////////
-  // Process and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations, n, radius, tile_size;
-  bool star = true;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
-      }
-
-      // number of times to run the algorithm
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      // linear grid dimension
-      n  = std::atoi(argv[2]);
-      if (n < 1) {
-        throw "ERROR: grid dimension must be positive";
-      } else if (n > std::floor(std::sqrt(INT_MAX))) {
-        throw "ERROR: grid dimension too large - overflow risk";
-      }
-
-      // default tile size for tiling of local transpose
-      tile_size = 32;
-      if (argc > 3) {
-          tile_size = std::atoi(argv[3]);
-          if (tile_size <= 0) tile_size = n;
-          if (tile_size > n) tile_size = n;
-      }
-
-      // stencil pattern
-      if (argc > 4) {
-          auto stencil = std::string(argv[4]);
-          auto grid = std::string("grid");
-          star = (stencil == grid) ? false : true;
-      }
-
-      // stencil radius
-      radius = 2;
-      if (argc > 5) {
-          radius = std::atoi(argv[5]);
-      }
-
-      if ( (radius < 1) || (2*radius+1 > n) ) {
-        throw "ERROR: Stencil radius negative or too large";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
+    std::cout << "Number of iterations = " << iterations << std::endl;
+    std::cout << "Grid size            = " << n << std::endl;
+    std::cout << "Tile size            = " << tile_size << std::endl;
+    std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+    std::cout << "Radius of stencil    = " << radius << std::endl;
+    std::cout << "Compact representation of stencil loop body" << std::endl;
+    std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+
+    auto stencil = nothing;
+    if (star) {
+        switch (radius) {
+            case 1: stencil = star1; break;
+            case 2: stencil = star2; break;
+            case 3: stencil = star3; break;
+            case 4: stencil = star4; break;
+            case 5: stencil = star5; break;
+        }
+    } else {
+        switch (radius) {
+            case 1: stencil = grid1; break;
+            case 2: stencil = grid2; break;
+            case 3: stencil = grid3; break;
+            case 4: stencil = grid4; break;
+            case 5: stencil = grid5; break;
+        }
+    }
 
-  std::cout << "Number of iterations = " << iterations << std::endl;
-  std::cout << "Grid size            = " << n << std::endl;
-  std::cout << "Tile size            = " << tile_size << std::endl;
-  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
-  std::cout << "Radius of stencil    = " << radius << std::endl;
-  std::cout << "Compact representation of stencil loop body" << std::endl;
-  std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
-
-  auto stencil = nothing;
-  if (star) {
-      switch (radius) {
-          case 1: stencil = star1; break;
-          case 2: stencil = star2; break;
-          case 3: stencil = star3; break;
-          case 4: stencil = star4; break;
-          case 5: stencil = star5; break;
-      }
-  } else {
-      switch (radius) {
-          case 1: stencil = grid1; break;
-          case 2: stencil = grid2; break;
-          case 3: stencil = grid3; break;
-          case 4: stencil = grid4; break;
-          case 5: stencil = grid5; break;
-      }
-  }
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space and perform the computation
+    //////////////////////////////////////////////////////////////////////
 
-  //////////////////////////////////////////////////////////////////////
-  // Allocate space and perform the computation
-  //////////////////////////////////////////////////////////////////////
+    double stencil_time(0);
 
-  auto stencil_time = 0.0;
+    // row-major 2D array
+    matrix in("in", n, n);
+    matrix out("out", n, n);
 
-  // row-major 2D array
-  matrix in("in", n, n);
-  matrix out("out", n, n);
+    auto z2     = {0,0};
+    auto n2     = {n,n};
+    auto tile2  = {tile_size,tile_size};
+    auto full   = Kokkos::MDRangePolicy<Kokkos::Rank<2>>(z2,n2,tile2);
 
-  try {
-    Kokkos::parallel_for ( n,[&] (int i) {
-      for (auto j=0; j<n; ++j){
-          in(i,j) = static_cast<double>(i+j);
-          out(i,j) = 0.0;
-      }
+    Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) {
+        in(i,j)  = static_cast<double>(i+j);
+        out(i,j) = 0.0;
     });
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-  catch (std::exception const & e) {
-    std::cout << e.what() << std::endl;
-    return 1;
-  }
-
-  for (auto iter = 0; iter<=iterations; iter++) {
 
-    if (iter==1) stencil_time = prk::wtime();
-    // Apply the stencil operator
-    stencil(n, tile_size, in, out);
-    // Add constant to solution to force refresh of neighbor data, if any
-    Kokkos::parallel_for ( n,[&] (int i) {
-      for (auto j=0; j<n; ++j){
-        in(i,j) += 1.0;
-      }
-    });
-  }
+    for (int iter = 0; iter<=iterations; ++iter) {
 
-  stencil_time = prk::wtime() - stencil_time;
+      if (iter==1) stencil_time = prk::wtime();
 
-  //////////////////////////////////////////////////////////////////////
-  // Analyze and output results.
-  //////////////////////////////////////////////////////////////////////
+      stencil(n, tile_size, in, out);
 
-  size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
-  // compute L1 norm in parallel
-  double norm = 0.0;
-  auto inside = boost::irange(radius,n-radius);
-  for (auto i : inside) {
-    for (auto j : inside) {
-      norm += std::fabs(out(i,j));
+      Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) {
+          in(i,j) += 1.0;
+      });
     }
-  }
-  norm /= active_points;
-
-  // verify correctness
-  const double epsilon = 1.0e-8;
-  double reference_norm = 2.*(iterations+1.);
-  if (std::fabs(norm-reference_norm) > epsilon) {
-    std::cout << "ERROR: L1 norm = " << norm
-              << " Reference L1 norm = " << reference_norm << std::endl;
-    return 1;
-  } else {
-    std::cout << "Solution validates" << std::endl;
+
+    stencil_time = prk::wtime() - stencil_time;
+
+    //////////////////////////////////////////////////////////////////////
+    // Analyze and output results.
+    //////////////////////////////////////////////////////////////////////
+
+    size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
+
+    double norm(0);
+    auto r2     = {radius,radius};
+    auto nr2    = {n-radius,n-radius};
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>(r2,nr2,tile2);
+    Kokkos::parallel_reduce(inside, KOKKOS_LAMBDA(int i, int j, double & norm) {
+        norm += std::fabs(out(i,j));
+    }, norm);
+    norm /= active_points;
+
+    // verify correctness
+    double const epsilon(1.0e-8);
+    double reference_norm = 2.*(iterations+1.);
+    if (std::fabs(norm-reference_norm) > epsilon) {
+      std::cout << "ERROR: L1 norm = " << norm
+                << " Reference L1 norm = " << reference_norm << std::endl;
+      return 1;
+    } else {
+      std::cout << "Solution validates" << std::endl;
 #ifdef VERBOSE
-    std::cout << "L1 norm = " << norm
-              << " Reference L1 norm = " << reference_norm << std::endl;
+      std::cout << "L1 norm = " << norm
+                << " Reference L1 norm = " << reference_norm << std::endl;
 #endif
-    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
-    size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
-    auto avgtime = stencil_time/iterations;
-    std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
-              << " Avg time (s): " << avgtime << std::endl;
+      const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+      size_t flops = (2.*stencil_size+1.) * active_points;
+      auto avgtime = stencil_time/iterations;
+      std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
   }
 
+  }
   Kokkos::finalize();
 
   return 0;
diff --git a/Cxx11/stencil_kokkos.hpp b/Cxx11/stencil_kokkos.hpp
index 5b67ee4ab..cb5009aae 100644
--- a/Cxx11/stencil_kokkos.hpp
+++ b/Cxx11/stencil_kokkos.hpp
@@ -1,20 +1,17 @@
 void star1(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(1,n-1), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=1; j<n-1; ++j) {
-            out(i,j) += +in(i+-1,j+0) * -0.5
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({1,1},{n-1,n-1},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-1,j+0) * -0.5
                           +in(i+0,j+-1) * -0.5
                           +in(i+0,j+1) * 0.5
                           +in(i+1,j+0) * 0.5;
-       }
      });
 }
 
 void star2(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(2,n-2), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=2; j<n-2; ++j) {
-            out(i,j) += +in(i+-2,j+0) * -0.125
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({2,2},{n-2,n-2},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-2,j+0) * -0.125
                           +in(i+-1,j+0) * -0.25
                           +in(i+0,j+-2) * -0.125
                           +in(i+0,j+-1) * -0.25
@@ -22,374 +19,357 @@ void star2(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+0,j+2) * 0.125
                           +in(i+1,j+0) * 0.25
                           +in(i+2,j+0) * 0.125;
-       }
      });
 }
 
 void star3(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(3,n-3), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=3; j<n-3; ++j) {
-            out(i,j) += +in(i+-3,j+0) * -0.05555555555555555
-                          +in(i+-2,j+0) * -0.08333333333333333
-                          +in(i+-1,j+0) * -0.16666666666666666
-                          +in(i+0,j+-3) * -0.05555555555555555
-                          +in(i+0,j+-2) * -0.08333333333333333
-                          +in(i+0,j+-1) * -0.16666666666666666
-                          +in(i+0,j+1) * 0.16666666666666666
-                          +in(i+0,j+2) * 0.08333333333333333
-                          +in(i+0,j+3) * 0.05555555555555555
-                          +in(i+1,j+0) * 0.16666666666666666
-                          +in(i+2,j+0) * 0.08333333333333333
-                          +in(i+3,j+0) * 0.05555555555555555;
-       }
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({3,3},{n-3,n-3},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-3,j+0) * -0.0555555555556
+                          +in(i+-2,j+0) * -0.0833333333333
+                          +in(i+-1,j+0) * -0.166666666667
+                          +in(i+0,j+-3) * -0.0555555555556
+                          +in(i+0,j+-2) * -0.0833333333333
+                          +in(i+0,j+-1) * -0.166666666667
+                          +in(i+0,j+1) * 0.166666666667
+                          +in(i+0,j+2) * 0.0833333333333
+                          +in(i+0,j+3) * 0.0555555555556
+                          +in(i+1,j+0) * 0.166666666667
+                          +in(i+2,j+0) * 0.0833333333333
+                          +in(i+3,j+0) * 0.0555555555556;
      });
 }
 
 void star4(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(4,n-4), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=4; j<n-4; ++j) {
-            out(i,j) += +in(i+-4,j+0) * -0.03125
-                          +in(i+-3,j+0) * -0.041666666666666664
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({4,4},{n-4,n-4},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-4,j+0) * -0.03125
+                          +in(i+-3,j+0) * -0.0416666666667
                           +in(i+-2,j+0) * -0.0625
                           +in(i+-1,j+0) * -0.125
                           +in(i+0,j+-4) * -0.03125
-                          +in(i+0,j+-3) * -0.041666666666666664
+                          +in(i+0,j+-3) * -0.0416666666667
                           +in(i+0,j+-2) * -0.0625
                           +in(i+0,j+-1) * -0.125
                           +in(i+0,j+1) * 0.125
                           +in(i+0,j+2) * 0.0625
-                          +in(i+0,j+3) * 0.041666666666666664
+                          +in(i+0,j+3) * 0.0416666666667
                           +in(i+0,j+4) * 0.03125
                           +in(i+1,j+0) * 0.125
                           +in(i+2,j+0) * 0.0625
-                          +in(i+3,j+0) * 0.041666666666666664
+                          +in(i+3,j+0) * 0.0416666666667
                           +in(i+4,j+0) * 0.03125;
-       }
      });
 }
 
 void star5(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(5,n-5), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=5; j<n-5; ++j) {
-            out(i,j) += +in(i+-5,j+0) * -0.02
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({5,5},{n-5,n-5},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-5,j+0) * -0.02
                           +in(i+-4,j+0) * -0.025
-                          +in(i+-3,j+0) * -0.03333333333333333
+                          +in(i+-3,j+0) * -0.0333333333333
                           +in(i+-2,j+0) * -0.05
                           +in(i+-1,j+0) * -0.1
                           +in(i+0,j+-5) * -0.02
                           +in(i+0,j+-4) * -0.025
-                          +in(i+0,j+-3) * -0.03333333333333333
+                          +in(i+0,j+-3) * -0.0333333333333
                           +in(i+0,j+-2) * -0.05
                           +in(i+0,j+-1) * -0.1
                           +in(i+0,j+1) * 0.1
                           +in(i+0,j+2) * 0.05
-                          +in(i+0,j+3) * 0.03333333333333333
+                          +in(i+0,j+3) * 0.0333333333333
                           +in(i+0,j+4) * 0.025
                           +in(i+0,j+5) * 0.02
                           +in(i+1,j+0) * 0.1
                           +in(i+2,j+0) * 0.05
-                          +in(i+3,j+0) * 0.03333333333333333
+                          +in(i+3,j+0) * 0.0333333333333
                           +in(i+4,j+0) * 0.025
                           +in(i+5,j+0) * 0.02;
-       }
      });
 }
 
 void grid1(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(1,n-1), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=1; j<n-1; ++j) {
-            out(i,j) += +in(i+-1,j+-1) * -0.25
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({1,1},{n-1,n-1},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-1,j+-1) * -0.25
                           +in(i+-1,j+0) * -0.25
                           +in(i+0,j+-1) * -0.25
                           +in(i+0,j+1) * 0.25
                           +in(i+1,j+0) * 0.25
                           +in(i+1,j+1) * 0.25
                           ;
-       }
      });
 }
 
 void grid2(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(2,n-2), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=2; j<n-2; ++j) {
-            out(i,j) += +in(i+-2,j+-2) * -0.0625
-                          +in(i+-2,j+-1) * -0.020833333333333332
-                          +in(i+-2,j+0) * -0.020833333333333332
-                          +in(i+-2,j+1) * -0.020833333333333332
-                          +in(i+-1,j+-2) * -0.020833333333333332
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({2,2},{n-2,n-2},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-2,j+-2) * -0.0625
+                          +in(i+-2,j+-1) * -0.0208333333333
+                          +in(i+-2,j+0) * -0.0208333333333
+                          +in(i+-2,j+1) * -0.0208333333333
+                          +in(i+-1,j+-2) * -0.0208333333333
                           +in(i+-1,j+-1) * -0.125
                           +in(i+-1,j+0) * -0.125
-                          +in(i+-1,j+2) * 0.020833333333333332
-                          +in(i+0,j+-2) * -0.020833333333333332
+                          +in(i+-1,j+2) * 0.0208333333333
+                          +in(i+0,j+-2) * -0.0208333333333
                           +in(i+0,j+-1) * -0.125
                           +in(i+0,j+1) * 0.125
-                          +in(i+0,j+2) * 0.020833333333333332
-                          +in(i+1,j+-2) * -0.020833333333333332
+                          +in(i+0,j+2) * 0.0208333333333
+                          +in(i+1,j+-2) * -0.0208333333333
                           +in(i+1,j+0) * 0.125
                           +in(i+1,j+1) * 0.125
-                          +in(i+1,j+2) * 0.020833333333333332
-                          +in(i+2,j+-1) * 0.020833333333333332
-                          +in(i+2,j+0) * 0.020833333333333332
-                          +in(i+2,j+1) * 0.020833333333333332
+                          +in(i+1,j+2) * 0.0208333333333
+                          +in(i+2,j+-1) * 0.0208333333333
+                          +in(i+2,j+0) * 0.0208333333333
+                          +in(i+2,j+1) * 0.0208333333333
                           +in(i+2,j+2) * 0.0625
                           ;
-       }
      });
 }
 
 void grid3(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(3,n-3), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=3; j<n-3; ++j) {
-            out(i,j) += +in(i+-3,j+-3) * -0.027777777777777776
-                          +in(i+-3,j+-2) * -0.005555555555555556
-                          +in(i+-3,j+-1) * -0.005555555555555556
-                          +in(i+-3,j+0) * -0.005555555555555556
-                          +in(i+-3,j+1) * -0.005555555555555556
-                          +in(i+-3,j+2) * -0.005555555555555556
-                          +in(i+-2,j+-3) * -0.005555555555555556
-                          +in(i+-2,j+-2) * -0.041666666666666664
-                          +in(i+-2,j+-1) * -0.013888888888888888
-                          +in(i+-2,j+0) * -0.013888888888888888
-                          +in(i+-2,j+1) * -0.013888888888888888
-                          +in(i+-2,j+3) * 0.005555555555555556
-                          +in(i+-1,j+-3) * -0.005555555555555556
-                          +in(i+-1,j+-2) * -0.013888888888888888
-                          +in(i+-1,j+-1) * -0.08333333333333333
-                          +in(i+-1,j+0) * -0.08333333333333333
-                          +in(i+-1,j+2) * 0.013888888888888888
-                          +in(i+-1,j+3) * 0.005555555555555556
-                          +in(i+0,j+-3) * -0.005555555555555556
-                          +in(i+0,j+-2) * -0.013888888888888888
-                          +in(i+0,j+-1) * -0.08333333333333333
-                          +in(i+0,j+1) * 0.08333333333333333
-                          +in(i+0,j+2) * 0.013888888888888888
-                          +in(i+0,j+3) * 0.005555555555555556
-                          +in(i+1,j+-3) * -0.005555555555555556
-                          +in(i+1,j+-2) * -0.013888888888888888
-                          +in(i+1,j+0) * 0.08333333333333333
-                          +in(i+1,j+1) * 0.08333333333333333
-                          +in(i+1,j+2) * 0.013888888888888888
-                          +in(i+1,j+3) * 0.005555555555555556
-                          +in(i+2,j+-3) * -0.005555555555555556
-                          +in(i+2,j+-1) * 0.013888888888888888
-                          +in(i+2,j+0) * 0.013888888888888888
-                          +in(i+2,j+1) * 0.013888888888888888
-                          +in(i+2,j+2) * 0.041666666666666664
-                          +in(i+2,j+3) * 0.005555555555555556
-                          +in(i+3,j+-2) * 0.005555555555555556
-                          +in(i+3,j+-1) * 0.005555555555555556
-                          +in(i+3,j+0) * 0.005555555555555556
-                          +in(i+3,j+1) * 0.005555555555555556
-                          +in(i+3,j+2) * 0.005555555555555556
-                          +in(i+3,j+3) * 0.027777777777777776
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({3,3},{n-3,n-3},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-3,j+-3) * -0.0277777777778
+                          +in(i+-3,j+-2) * -0.00555555555556
+                          +in(i+-3,j+-1) * -0.00555555555556
+                          +in(i+-3,j+0) * -0.00555555555556
+                          +in(i+-3,j+1) * -0.00555555555556
+                          +in(i+-3,j+2) * -0.00555555555556
+                          +in(i+-2,j+-3) * -0.00555555555556
+                          +in(i+-2,j+-2) * -0.0416666666667
+                          +in(i+-2,j+-1) * -0.0138888888889
+                          +in(i+-2,j+0) * -0.0138888888889
+                          +in(i+-2,j+1) * -0.0138888888889
+                          +in(i+-2,j+3) * 0.00555555555556
+                          +in(i+-1,j+-3) * -0.00555555555556
+                          +in(i+-1,j+-2) * -0.0138888888889
+                          +in(i+-1,j+-1) * -0.0833333333333
+                          +in(i+-1,j+0) * -0.0833333333333
+                          +in(i+-1,j+2) * 0.0138888888889
+                          +in(i+-1,j+3) * 0.00555555555556
+                          +in(i+0,j+-3) * -0.00555555555556
+                          +in(i+0,j+-2) * -0.0138888888889
+                          +in(i+0,j+-1) * -0.0833333333333
+                          +in(i+0,j+1) * 0.0833333333333
+                          +in(i+0,j+2) * 0.0138888888889
+                          +in(i+0,j+3) * 0.00555555555556
+                          +in(i+1,j+-3) * -0.00555555555556
+                          +in(i+1,j+-2) * -0.0138888888889
+                          +in(i+1,j+0) * 0.0833333333333
+                          +in(i+1,j+1) * 0.0833333333333
+                          +in(i+1,j+2) * 0.0138888888889
+                          +in(i+1,j+3) * 0.00555555555556
+                          +in(i+2,j+-3) * -0.00555555555556
+                          +in(i+2,j+-1) * 0.0138888888889
+                          +in(i+2,j+0) * 0.0138888888889
+                          +in(i+2,j+1) * 0.0138888888889
+                          +in(i+2,j+2) * 0.0416666666667
+                          +in(i+2,j+3) * 0.00555555555556
+                          +in(i+3,j+-2) * 0.00555555555556
+                          +in(i+3,j+-1) * 0.00555555555556
+                          +in(i+3,j+0) * 0.00555555555556
+                          +in(i+3,j+1) * 0.00555555555556
+                          +in(i+3,j+2) * 0.00555555555556
+                          +in(i+3,j+3) * 0.0277777777778
                           ;
-       }
      });
 }
 
 void grid4(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(4,n-4), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=4; j<n-4; ++j) {
-            out(i,j) += +in(i+-4,j+-4) * -0.015625
-                          +in(i+-4,j+-3) * -0.002232142857142857
-                          +in(i+-4,j+-2) * -0.002232142857142857
-                          +in(i+-4,j+-1) * -0.002232142857142857
-                          +in(i+-4,j+0) * -0.002232142857142857
-                          +in(i+-4,j+1) * -0.002232142857142857
-                          +in(i+-4,j+2) * -0.002232142857142857
-                          +in(i+-4,j+3) * -0.002232142857142857
-                          +in(i+-3,j+-4) * -0.002232142857142857
-                          +in(i+-3,j+-3) * -0.020833333333333332
-                          +in(i+-3,j+-2) * -0.004166666666666667
-                          +in(i+-3,j+-1) * -0.004166666666666667
-                          +in(i+-3,j+0) * -0.004166666666666667
-                          +in(i+-3,j+1) * -0.004166666666666667
-                          +in(i+-3,j+2) * -0.004166666666666667
-                          +in(i+-3,j+4) * 0.002232142857142857
-                          +in(i+-2,j+-4) * -0.002232142857142857
-                          +in(i+-2,j+-3) * -0.004166666666666667
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({4,4},{n-4,n-4},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-4,j+-4) * -0.015625
+                          +in(i+-4,j+-3) * -0.00223214285714
+                          +in(i+-4,j+-2) * -0.00223214285714
+                          +in(i+-4,j+-1) * -0.00223214285714
+                          +in(i+-4,j+0) * -0.00223214285714
+                          +in(i+-4,j+1) * -0.00223214285714
+                          +in(i+-4,j+2) * -0.00223214285714
+                          +in(i+-4,j+3) * -0.00223214285714
+                          +in(i+-3,j+-4) * -0.00223214285714
+                          +in(i+-3,j+-3) * -0.0208333333333
+                          +in(i+-3,j+-2) * -0.00416666666667
+                          +in(i+-3,j+-1) * -0.00416666666667
+                          +in(i+-3,j+0) * -0.00416666666667
+                          +in(i+-3,j+1) * -0.00416666666667
+                          +in(i+-3,j+2) * -0.00416666666667
+                          +in(i+-3,j+4) * 0.00223214285714
+                          +in(i+-2,j+-4) * -0.00223214285714
+                          +in(i+-2,j+-3) * -0.00416666666667
                           +in(i+-2,j+-2) * -0.03125
-                          +in(i+-2,j+-1) * -0.010416666666666666
-                          +in(i+-2,j+0) * -0.010416666666666666
-                          +in(i+-2,j+1) * -0.010416666666666666
-                          +in(i+-2,j+3) * 0.004166666666666667
-                          +in(i+-2,j+4) * 0.002232142857142857
-                          +in(i+-1,j+-4) * -0.002232142857142857
-                          +in(i+-1,j+-3) * -0.004166666666666667
-                          +in(i+-1,j+-2) * -0.010416666666666666
+                          +in(i+-2,j+-1) * -0.0104166666667
+                          +in(i+-2,j+0) * -0.0104166666667
+                          +in(i+-2,j+1) * -0.0104166666667
+                          +in(i+-2,j+3) * 0.00416666666667
+                          +in(i+-2,j+4) * 0.00223214285714
+                          +in(i+-1,j+-4) * -0.00223214285714
+                          +in(i+-1,j+-3) * -0.00416666666667
+                          +in(i+-1,j+-2) * -0.0104166666667
                           +in(i+-1,j+-1) * -0.0625
                           +in(i+-1,j+0) * -0.0625
-                          +in(i+-1,j+2) * 0.010416666666666666
-                          +in(i+-1,j+3) * 0.004166666666666667
-                          +in(i+-1,j+4) * 0.002232142857142857
-                          +in(i+0,j+-4) * -0.002232142857142857
-                          +in(i+0,j+-3) * -0.004166666666666667
-                          +in(i+0,j+-2) * -0.010416666666666666
+                          +in(i+-1,j+2) * 0.0104166666667
+                          +in(i+-1,j+3) * 0.00416666666667
+                          +in(i+-1,j+4) * 0.00223214285714
+                          +in(i+0,j+-4) * -0.00223214285714
+                          +in(i+0,j+-3) * -0.00416666666667
+                          +in(i+0,j+-2) * -0.0104166666667
                           +in(i+0,j+-1) * -0.0625
                           +in(i+0,j+1) * 0.0625
-                          +in(i+0,j+2) * 0.010416666666666666
-                          +in(i+0,j+3) * 0.004166666666666667
-                          +in(i+0,j+4) * 0.002232142857142857
-                          +in(i+1,j+-4) * -0.002232142857142857
-                          +in(i+1,j+-3) * -0.004166666666666667
-                          +in(i+1,j+-2) * -0.010416666666666666
+                          +in(i+0,j+2) * 0.0104166666667
+                          +in(i+0,j+3) * 0.00416666666667
+                          +in(i+0,j+4) * 0.00223214285714
+                          +in(i+1,j+-4) * -0.00223214285714
+                          +in(i+1,j+-3) * -0.00416666666667
+                          +in(i+1,j+-2) * -0.0104166666667
                           +in(i+1,j+0) * 0.0625
                           +in(i+1,j+1) * 0.0625
-                          +in(i+1,j+2) * 0.010416666666666666
-                          +in(i+1,j+3) * 0.004166666666666667
-                          +in(i+1,j+4) * 0.002232142857142857
-                          +in(i+2,j+-4) * -0.002232142857142857
-                          +in(i+2,j+-3) * -0.004166666666666667
-                          +in(i+2,j+-1) * 0.010416666666666666
-                          +in(i+2,j+0) * 0.010416666666666666
-                          +in(i+2,j+1) * 0.010416666666666666
+                          +in(i+1,j+2) * 0.0104166666667
+                          +in(i+1,j+3) * 0.00416666666667
+                          +in(i+1,j+4) * 0.00223214285714
+                          +in(i+2,j+-4) * -0.00223214285714
+                          +in(i+2,j+-3) * -0.00416666666667
+                          +in(i+2,j+-1) * 0.0104166666667
+                          +in(i+2,j+0) * 0.0104166666667
+                          +in(i+2,j+1) * 0.0104166666667
                           +in(i+2,j+2) * 0.03125
-                          +in(i+2,j+3) * 0.004166666666666667
-                          +in(i+2,j+4) * 0.002232142857142857
-                          +in(i+3,j+-4) * -0.002232142857142857
-                          +in(i+3,j+-2) * 0.004166666666666667
-                          +in(i+3,j+-1) * 0.004166666666666667
-                          +in(i+3,j+0) * 0.004166666666666667
-                          +in(i+3,j+1) * 0.004166666666666667
-                          +in(i+3,j+2) * 0.004166666666666667
-                          +in(i+3,j+3) * 0.020833333333333332
-                          +in(i+3,j+4) * 0.002232142857142857
-                          +in(i+4,j+-3) * 0.002232142857142857
-                          +in(i+4,j+-2) * 0.002232142857142857
-                          +in(i+4,j+-1) * 0.002232142857142857
-                          +in(i+4,j+0) * 0.002232142857142857
-                          +in(i+4,j+1) * 0.002232142857142857
-                          +in(i+4,j+2) * 0.002232142857142857
-                          +in(i+4,j+3) * 0.002232142857142857
+                          +in(i+2,j+3) * 0.00416666666667
+                          +in(i+2,j+4) * 0.00223214285714
+                          +in(i+3,j+-4) * -0.00223214285714
+                          +in(i+3,j+-2) * 0.00416666666667
+                          +in(i+3,j+-1) * 0.00416666666667
+                          +in(i+3,j+0) * 0.00416666666667
+                          +in(i+3,j+1) * 0.00416666666667
+                          +in(i+3,j+2) * 0.00416666666667
+                          +in(i+3,j+3) * 0.0208333333333
+                          +in(i+3,j+4) * 0.00223214285714
+                          +in(i+4,j+-3) * 0.00223214285714
+                          +in(i+4,j+-2) * 0.00223214285714
+                          +in(i+4,j+-1) * 0.00223214285714
+                          +in(i+4,j+0) * 0.00223214285714
+                          +in(i+4,j+1) * 0.00223214285714
+                          +in(i+4,j+2) * 0.00223214285714
+                          +in(i+4,j+3) * 0.00223214285714
                           +in(i+4,j+4) * 0.015625
                           ;
-       }
      });
 }
 
 void grid5(const int n, const int t, matrix & in, matrix & out) {
-    Kokkos::parallel_for ( Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(5,n-5), KOKKOS_LAMBDA(const int i) {
-      PRAGMA_SIMD
-      for (auto j=5; j<n-5; ++j) {
-            out(i,j) += +in(i+-5,j+-5) * -0.01
-                          +in(i+-5,j+-4) * -0.0011111111111111111
-                          +in(i+-5,j+-3) * -0.0011111111111111111
-                          +in(i+-5,j+-2) * -0.0011111111111111111
-                          +in(i+-5,j+-1) * -0.0011111111111111111
-                          +in(i+-5,j+0) * -0.0011111111111111111
-                          +in(i+-5,j+1) * -0.0011111111111111111
-                          +in(i+-5,j+2) * -0.0011111111111111111
-                          +in(i+-5,j+3) * -0.0011111111111111111
-                          +in(i+-5,j+4) * -0.0011111111111111111
-                          +in(i+-4,j+-5) * -0.0011111111111111111
+    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({5,5},{n-5,n-5},{t,t});
+    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
+              out(i,j) += +in(i+-5,j+-5) * -0.01
+                          +in(i+-5,j+-4) * -0.00111111111111
+                          +in(i+-5,j+-3) * -0.00111111111111
+                          +in(i+-5,j+-2) * -0.00111111111111
+                          +in(i+-5,j+-1) * -0.00111111111111
+                          +in(i+-5,j+0) * -0.00111111111111
+                          +in(i+-5,j+1) * -0.00111111111111
+                          +in(i+-5,j+2) * -0.00111111111111
+                          +in(i+-5,j+3) * -0.00111111111111
+                          +in(i+-5,j+4) * -0.00111111111111
+                          +in(i+-4,j+-5) * -0.00111111111111
                           +in(i+-4,j+-4) * -0.0125
-                          +in(i+-4,j+-3) * -0.0017857142857142857
-                          +in(i+-4,j+-2) * -0.0017857142857142857
-                          +in(i+-4,j+-1) * -0.0017857142857142857
-                          +in(i+-4,j+0) * -0.0017857142857142857
-                          +in(i+-4,j+1) * -0.0017857142857142857
-                          +in(i+-4,j+2) * -0.0017857142857142857
-                          +in(i+-4,j+3) * -0.0017857142857142857
-                          +in(i+-4,j+5) * 0.0011111111111111111
-                          +in(i+-3,j+-5) * -0.0011111111111111111
-                          +in(i+-3,j+-4) * -0.0017857142857142857
-                          +in(i+-3,j+-3) * -0.016666666666666666
-                          +in(i+-3,j+-2) * -0.0033333333333333335
-                          +in(i+-3,j+-1) * -0.0033333333333333335
-                          +in(i+-3,j+0) * -0.0033333333333333335
-                          +in(i+-3,j+1) * -0.0033333333333333335
-                          +in(i+-3,j+2) * -0.0033333333333333335
-                          +in(i+-3,j+4) * 0.0017857142857142857
-                          +in(i+-3,j+5) * 0.0011111111111111111
-                          +in(i+-2,j+-5) * -0.0011111111111111111
-                          +in(i+-2,j+-4) * -0.0017857142857142857
-                          +in(i+-2,j+-3) * -0.0033333333333333335
+                          +in(i+-4,j+-3) * -0.00178571428571
+                          +in(i+-4,j+-2) * -0.00178571428571
+                          +in(i+-4,j+-1) * -0.00178571428571
+                          +in(i+-4,j+0) * -0.00178571428571
+                          +in(i+-4,j+1) * -0.00178571428571
+                          +in(i+-4,j+2) * -0.00178571428571
+                          +in(i+-4,j+3) * -0.00178571428571
+                          +in(i+-4,j+5) * 0.00111111111111
+                          +in(i+-3,j+-5) * -0.00111111111111
+                          +in(i+-3,j+-4) * -0.00178571428571
+                          +in(i+-3,j+-3) * -0.0166666666667
+                          +in(i+-3,j+-2) * -0.00333333333333
+                          +in(i+-3,j+-1) * -0.00333333333333
+                          +in(i+-3,j+0) * -0.00333333333333
+                          +in(i+-3,j+1) * -0.00333333333333
+                          +in(i+-3,j+2) * -0.00333333333333
+                          +in(i+-3,j+4) * 0.00178571428571
+                          +in(i+-3,j+5) * 0.00111111111111
+                          +in(i+-2,j+-5) * -0.00111111111111
+                          +in(i+-2,j+-4) * -0.00178571428571
+                          +in(i+-2,j+-3) * -0.00333333333333
                           +in(i+-2,j+-2) * -0.025
-                          +in(i+-2,j+-1) * -0.008333333333333333
-                          +in(i+-2,j+0) * -0.008333333333333333
-                          +in(i+-2,j+1) * -0.008333333333333333
-                          +in(i+-2,j+3) * 0.0033333333333333335
-                          +in(i+-2,j+4) * 0.0017857142857142857
-                          +in(i+-2,j+5) * 0.0011111111111111111
-                          +in(i+-1,j+-5) * -0.0011111111111111111
-                          +in(i+-1,j+-4) * -0.0017857142857142857
-                          +in(i+-1,j+-3) * -0.0033333333333333335
-                          +in(i+-1,j+-2) * -0.008333333333333333
+                          +in(i+-2,j+-1) * -0.00833333333333
+                          +in(i+-2,j+0) * -0.00833333333333
+                          +in(i+-2,j+1) * -0.00833333333333
+                          +in(i+-2,j+3) * 0.00333333333333
+                          +in(i+-2,j+4) * 0.00178571428571
+                          +in(i+-2,j+5) * 0.00111111111111
+                          +in(i+-1,j+-5) * -0.00111111111111
+                          +in(i+-1,j+-4) * -0.00178571428571
+                          +in(i+-1,j+-3) * -0.00333333333333
+                          +in(i+-1,j+-2) * -0.00833333333333
                           +in(i+-1,j+-1) * -0.05
                           +in(i+-1,j+0) * -0.05
-                          +in(i+-1,j+2) * 0.008333333333333333
-                          +in(i+-1,j+3) * 0.0033333333333333335
-                          +in(i+-1,j+4) * 0.0017857142857142857
-                          +in(i+-1,j+5) * 0.0011111111111111111
-                          +in(i+0,j+-5) * -0.0011111111111111111
-                          +in(i+0,j+-4) * -0.0017857142857142857
-                          +in(i+0,j+-3) * -0.0033333333333333335
-                          +in(i+0,j+-2) * -0.008333333333333333
+                          +in(i+-1,j+2) * 0.00833333333333
+                          +in(i+-1,j+3) * 0.00333333333333
+                          +in(i+-1,j+4) * 0.00178571428571
+                          +in(i+-1,j+5) * 0.00111111111111
+                          +in(i+0,j+-5) * -0.00111111111111
+                          +in(i+0,j+-4) * -0.00178571428571
+                          +in(i+0,j+-3) * -0.00333333333333
+                          +in(i+0,j+-2) * -0.00833333333333
                           +in(i+0,j+-1) * -0.05
                           +in(i+0,j+1) * 0.05
-                          +in(i+0,j+2) * 0.008333333333333333
-                          +in(i+0,j+3) * 0.0033333333333333335
-                          +in(i+0,j+4) * 0.0017857142857142857
-                          +in(i+0,j+5) * 0.0011111111111111111
-                          +in(i+1,j+-5) * -0.0011111111111111111
-                          +in(i+1,j+-4) * -0.0017857142857142857
-                          +in(i+1,j+-3) * -0.0033333333333333335
-                          +in(i+1,j+-2) * -0.008333333333333333
+                          +in(i+0,j+2) * 0.00833333333333
+                          +in(i+0,j+3) * 0.00333333333333
+                          +in(i+0,j+4) * 0.00178571428571
+                          +in(i+0,j+5) * 0.00111111111111
+                          +in(i+1,j+-5) * -0.00111111111111
+                          +in(i+1,j+-4) * -0.00178571428571
+                          +in(i+1,j+-3) * -0.00333333333333
+                          +in(i+1,j+-2) * -0.00833333333333
                           +in(i+1,j+0) * 0.05
                           +in(i+1,j+1) * 0.05
-                          +in(i+1,j+2) * 0.008333333333333333
-                          +in(i+1,j+3) * 0.0033333333333333335
-                          +in(i+1,j+4) * 0.0017857142857142857
-                          +in(i+1,j+5) * 0.0011111111111111111
-                          +in(i+2,j+-5) * -0.0011111111111111111
-                          +in(i+2,j+-4) * -0.0017857142857142857
-                          +in(i+2,j+-3) * -0.0033333333333333335
-                          +in(i+2,j+-1) * 0.008333333333333333
-                          +in(i+2,j+0) * 0.008333333333333333
-                          +in(i+2,j+1) * 0.008333333333333333
+                          +in(i+1,j+2) * 0.00833333333333
+                          +in(i+1,j+3) * 0.00333333333333
+                          +in(i+1,j+4) * 0.00178571428571
+                          +in(i+1,j+5) * 0.00111111111111
+                          +in(i+2,j+-5) * -0.00111111111111
+                          +in(i+2,j+-4) * -0.00178571428571
+                          +in(i+2,j+-3) * -0.00333333333333
+                          +in(i+2,j+-1) * 0.00833333333333
+                          +in(i+2,j+0) * 0.00833333333333
+                          +in(i+2,j+1) * 0.00833333333333
                           +in(i+2,j+2) * 0.025
-                          +in(i+2,j+3) * 0.0033333333333333335
-                          +in(i+2,j+4) * 0.0017857142857142857
-                          +in(i+2,j+5) * 0.0011111111111111111
-                          +in(i+3,j+-5) * -0.0011111111111111111
-                          +in(i+3,j+-4) * -0.0017857142857142857
-                          +in(i+3,j+-2) * 0.0033333333333333335
-                          +in(i+3,j+-1) * 0.0033333333333333335
-                          +in(i+3,j+0) * 0.0033333333333333335
-                          +in(i+3,j+1) * 0.0033333333333333335
-                          +in(i+3,j+2) * 0.0033333333333333335
-                          +in(i+3,j+3) * 0.016666666666666666
-                          +in(i+3,j+4) * 0.0017857142857142857
-                          +in(i+3,j+5) * 0.0011111111111111111
-                          +in(i+4,j+-5) * -0.0011111111111111111
-                          +in(i+4,j+-3) * 0.0017857142857142857
-                          +in(i+4,j+-2) * 0.0017857142857142857
-                          +in(i+4,j+-1) * 0.0017857142857142857
-                          +in(i+4,j+0) * 0.0017857142857142857
-                          +in(i+4,j+1) * 0.0017857142857142857
-                          +in(i+4,j+2) * 0.0017857142857142857
-                          +in(i+4,j+3) * 0.0017857142857142857
+                          +in(i+2,j+3) * 0.00333333333333
+                          +in(i+2,j+4) * 0.00178571428571
+                          +in(i+2,j+5) * 0.00111111111111
+                          +in(i+3,j+-5) * -0.00111111111111
+                          +in(i+3,j+-4) * -0.00178571428571
+                          +in(i+3,j+-2) * 0.00333333333333
+                          +in(i+3,j+-1) * 0.00333333333333
+                          +in(i+3,j+0) * 0.00333333333333
+                          +in(i+3,j+1) * 0.00333333333333
+                          +in(i+3,j+2) * 0.00333333333333
+                          +in(i+3,j+3) * 0.0166666666667
+                          +in(i+3,j+4) * 0.00178571428571
+                          +in(i+3,j+5) * 0.00111111111111
+                          +in(i+4,j+-5) * -0.00111111111111
+                          +in(i+4,j+-3) * 0.00178571428571
+                          +in(i+4,j+-2) * 0.00178571428571
+                          +in(i+4,j+-1) * 0.00178571428571
+                          +in(i+4,j+0) * 0.00178571428571
+                          +in(i+4,j+1) * 0.00178571428571
+                          +in(i+4,j+2) * 0.00178571428571
+                          +in(i+4,j+3) * 0.00178571428571
                           +in(i+4,j+4) * 0.0125
-                          +in(i+4,j+5) * 0.0011111111111111111
-                          +in(i+5,j+-4) * 0.0011111111111111111
-                          +in(i+5,j+-3) * 0.0011111111111111111
-                          +in(i+5,j+-2) * 0.0011111111111111111
-                          +in(i+5,j+-1) * 0.0011111111111111111
-                          +in(i+5,j+0) * 0.0011111111111111111
-                          +in(i+5,j+1) * 0.0011111111111111111
-                          +in(i+5,j+2) * 0.0011111111111111111
-                          +in(i+5,j+3) * 0.0011111111111111111
-                          +in(i+5,j+4) * 0.0011111111111111111
+                          +in(i+4,j+5) * 0.00111111111111
+                          +in(i+5,j+-4) * 0.00111111111111
+                          +in(i+5,j+-3) * 0.00111111111111
+                          +in(i+5,j+-2) * 0.00111111111111
+                          +in(i+5,j+-1) * 0.00111111111111
+                          +in(i+5,j+0) * 0.00111111111111
+                          +in(i+5,j+1) * 0.00111111111111
+                          +in(i+5,j+2) * 0.00111111111111
+                          +in(i+5,j+3) * 0.00111111111111
+                          +in(i+5,j+4) * 0.00111111111111
                           +in(i+5,j+5) * 0.01
                           ;
-       }
      });
 }
 
diff --git a/Cxx11/stencil_openmp.hpp b/Cxx11/stencil_openmp.hpp
index 2d01b06e7..42edf4570 100644
--- a/Cxx11/stencil_openmp.hpp
+++ b/Cxx11/stencil_openmp.hpp
@@ -43,18 +43,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
            }
          }
        }
@@ -69,20 +69,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
            }
          }
@@ -99,22 +99,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
            }
@@ -151,24 +151,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -184,48 +184,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
          }
@@ -241,76 +241,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -327,114 +327,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_pgnu.hpp b/Cxx11/stencil_pgnu.hpp
index 4cc9ca20f..c1236b120 100644
--- a/Cxx11/stencil_pgnu.hpp
+++ b/Cxx11/stencil_pgnu.hpp
@@ -30,18 +30,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = boost::irange(3,n-3);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
        });
      });
 }
@@ -51,20 +51,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
        });
      });
@@ -76,22 +76,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
        });
@@ -118,24 +118,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
        });
@@ -146,48 +146,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = boost::irange(3,n-3);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
        });
      });
@@ -198,76 +198,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
        });
@@ -279,114 +279,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
        });
diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp
index 4731f6faf..70ad6cf66 100644
--- a/Cxx11/stencil_pstl.hpp
+++ b/Cxx11/stencil_pstl.hpp
@@ -30,18 +30,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = boost::irange(3,n-3);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
        });
      });
 }
@@ -51,20 +51,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
        });
      });
@@ -76,22 +76,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
        });
@@ -118,24 +118,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
        });
@@ -146,48 +146,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = boost::irange(3,n-3);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
        });
      });
@@ -198,76 +198,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
        });
@@ -279,114 +279,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
        });
diff --git a/Cxx11/stencil_raja.hpp b/Cxx11/stencil_raja.hpp
index 1b39966f7..f3065e85e 100644
--- a/Cxx11/stencil_raja.hpp
+++ b/Cxx11/stencil_raja.hpp
@@ -27,18 +27,18 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
        });
      });
 }
@@ -47,20 +47,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     RAJA::forall<thread_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
        });
      });
@@ -71,22 +71,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
        });
@@ -111,24 +111,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     RAJA::forall<thread_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
        });
@@ -138,48 +138,48 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
        });
      });
@@ -189,76 +189,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     RAJA::forall<thread_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
        });
@@ -269,114 +269,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     RAJA::forall<thread_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
        });
diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp
index e9580e1fa..b6bf57581 100644
--- a/Cxx11/stencil_rangefor.hpp
+++ b/Cxx11/stencil_rangefor.hpp
@@ -37,18 +37,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
            }
          }
        }
@@ -61,20 +61,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j : inside) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
            }
          }
@@ -89,22 +89,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       for (auto j : inside) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
            }
@@ -137,24 +137,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j : inside) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -168,48 +168,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
          }
@@ -223,76 +223,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j : inside) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -307,114 +307,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j : inside) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_seq.hpp b/Cxx11/stencil_seq.hpp
index a29cc7de6..139082a3b 100644
--- a/Cxx11/stencil_seq.hpp
+++ b/Cxx11/stencil_seq.hpp
@@ -40,18 +40,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
            }
          }
        }
@@ -65,20 +65,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
            }
          }
@@ -94,22 +94,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
            }
@@ -144,24 +144,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -176,48 +176,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
          }
@@ -232,76 +232,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -317,114 +317,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_stl.hpp b/Cxx11/stencil_stl.hpp
index 2aff06b13..ecde3e1ce 100644
--- a/Cxx11/stencil_stl.hpp
+++ b/Cxx11/stencil_stl.hpp
@@ -30,18 +30,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = boost::irange(3,n-3);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
        });
      });
 }
@@ -51,20 +51,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
        });
      });
@@ -76,22 +76,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
        });
@@ -118,24 +118,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
        });
@@ -146,48 +146,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = boost::irange(3,n-3);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
        });
      });
@@ -198,76 +198,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
        });
@@ -279,114 +279,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
        });
diff --git a/Cxx11/stencil_target.hpp b/Cxx11/stencil_target.hpp
index 8e0359176..1e36ef2be 100644
--- a/Cxx11/stencil_target.hpp
+++ b/Cxx11/stencil_target.hpp
@@ -32,18 +32,18 @@ void star3(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=3; i<n-3; ++i) {
       for (auto j=3; j<n-3; ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
        }
      }
 }
@@ -53,20 +53,20 @@ void star4(const int n, const int t, const double * RESTRICT in, double * RESTRI
     for (auto i=4; i<n-4; ++i) {
       for (auto j=4; j<n-4; ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
        }
      }
@@ -78,22 +78,22 @@ void star5(const int n, const int t, const double * RESTRICT in, double * RESTRI
       for (auto j=5; j<n-5; ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
        }
@@ -120,24 +120,24 @@ void grid2(const int n, const int t, const double * RESTRICT in, double * RESTRI
     for (auto i=2; i<n-2; ++i) {
       for (auto j=2; j<n-2; ++j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
        }
@@ -148,48 +148,48 @@ void grid3(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=3; i<n-3; ++i) {
       for (auto j=3; j<n-3; ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
        }
      }
@@ -200,76 +200,76 @@ void grid4(const int n, const int t, const double * RESTRICT in, double * RESTRI
     for (auto i=4; i<n-4; ++i) {
       for (auto j=4; j<n-4; ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
        }
@@ -281,114 +281,114 @@ void grid5(const int n, const int t, const double * RESTRICT in, double * RESTRI
     for (auto i=5; i<n-5; ++i) {
       for (auto j=5; j<n-5; ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
        }
diff --git a/Cxx11/stencil_taskloop.hpp b/Cxx11/stencil_taskloop.hpp
index 240acb0ef..fec723685 100644
--- a/Cxx11/stencil_taskloop.hpp
+++ b/Cxx11/stencil_taskloop.hpp
@@ -43,18 +43,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
            }
          }
        }
@@ -69,20 +69,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
            }
          }
@@ -99,22 +99,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
            }
@@ -151,24 +151,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -184,48 +184,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
          }
@@ -241,76 +241,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -327,114 +327,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
           OMP_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_tbb.hpp b/Cxx11/stencil_tbb.hpp
index 5b4f8c24b..9dc8b4d16 100644
--- a/Cxx11/stencil_tbb.hpp
+++ b/Cxx11/stencil_tbb.hpp
@@ -38,18 +38,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.05555555555555555
-                          +in[(i+-2)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+-3)] * -0.05555555555555555
-                          +in[(i+0)*n+(j+-2)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+-1)] * -0.16666666666666666
-                          +in[(i+0)*n+(j+1)] * 0.16666666666666666
-                          +in[(i+0)*n+(j+2)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.05555555555555555
-                          +in[(i+1)*n+(j+0)] * 0.16666666666666666
-                          +in[(i+2)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+3)*n+(j+0)] * 0.05555555555555555;
+            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
+                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.166666666667
+                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
+                          +in[(i+0)*n+(j+-1)] * -0.166666666667
+                          +in[(i+0)*n+(j+1)] * 0.166666666667
+                          +in[(i+0)*n+(j+2)] * 0.0833333333333
+                          +in[(i+0)*n+(j+3)] * 0.0555555555556
+                          +in[(i+1)*n+(j+0)] * 0.166666666667
+                          +in[(i+2)*n+(j+0)] * 0.0833333333333
+                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
       }
     }
   }, tbb_partitioner );
@@ -62,20 +62,20 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.041666666666666664
+                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.125
                           +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.041666666666666664
+                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
                           +in[(i+0)*n+(j+-2)] * -0.0625
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
                           +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.041666666666666664
+                          +in[(i+0)*n+(j+3)] * 0.0416666666667
                           +in[(i+0)*n+(j+4)] * 0.03125
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.041666666666666664
+                          +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
       }
     }
@@ -90,22 +90,22 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.03333333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
                           +in[(i+-2)*n+(j+0)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.1
                           +in[(i+0)*n+(j+-5)] * -0.02
                           +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.03333333333333333
+                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
                           +in[(i+0)*n+(j+-2)] * -0.05
                           +in[(i+0)*n+(j+-1)] * -0.1
                           +in[(i+0)*n+(j+1)] * 0.1
                           +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.03333333333333333
+                          +in[(i+0)*n+(j+3)] * 0.0333333333333
                           +in[(i+0)*n+(j+4)] * 0.025
                           +in[(i+0)*n+(j+5)] * 0.02
                           +in[(i+1)*n+(j+0)] * 0.1
                           +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.03333333333333333
+                          +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
       }
@@ -138,24 +138,24 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+0)] * -0.020833333333333332
-                          +in[(i+-2)*n+(j+1)] * -0.020833333333333332
-                          +in[(i+-1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
+                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+-1)*n+(j+-1)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+0)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
                           +in[(i+0)*n+(j+-1)] * -0.125
                           +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+1)*n+(j+-2)] * -0.020833333333333332
+                          +in[(i+0)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
                           +in[(i+1)*n+(j+0)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+-1)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+0)] * 0.020833333333333332
-                          +in[(i+2)*n+(j+1)] * 0.020833333333333332
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
+                          +in[(i+2)*n+(j+0)] * 0.0208333333333
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
       }
@@ -169,48 +169,48 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776
-                          +in[(i+-3)*n+(j+-2)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.005555555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.041666666666666664
-                          +in[(i+-2)*n+(j+-1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+0)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+1)] * -0.013888888888888888
-                          +in[(i+-2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+-1)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+0)] * -0.08333333333333333
-                          +in[(i+-1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+-1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+0)*n+(j+-1)] * -0.08333333333333333
-                          +in[(i+0)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+0)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+0)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.013888888888888888
-                          +in[(i+1)*n+(j+0)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+1)] * 0.08333333333333333
-                          +in[(i+1)*n+(j+2)] * 0.013888888888888888
-                          +in[(i+1)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.005555555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+0)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+1)] * 0.013888888888888888
-                          +in[(i+2)*n+(j+2)] * 0.041666666666666664
-                          +in[(i+2)*n+(j+3)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+0)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+1)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+2)] * 0.005555555555555556
-                          +in[(i+3)*n+(j+3)] * 0.027777777777777776
+            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
+                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
+                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
+                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
+                          +in[(i+0)*n+(j+1)] * 0.0833333333333
+                          +in[(i+0)*n+(j+2)] * 0.0138888888889
+                          +in[(i+0)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+0)] * 0.0138888888889
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+0)] * 0.00555555555556
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
       }
     }
@@ -224,76 +224,76 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.002232142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.020833333333333332
-                          +in[(i+-3)*n+(j+-2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+0)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+1)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+2)] * -0.004166666666666667
-                          +in[(i+-3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.004166666666666667
+                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
+                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
                           +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+0)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+1)] * -0.010416666666666666
-                          +in[(i+-2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+-1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+-1)*n+(j+-1)] * -0.0625
                           +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+-1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+-1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+0)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+0)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
                           +in[(i+0)*n+(j+-1)] * -0.0625
                           +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+0)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+0)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+1)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+1)*n+(j+-2)] * -0.010416666666666666
+                          +in[(i+0)*n+(j+2)] * 0.0104166666667
+                          +in[(i+0)*n+(j+3)] * 0.00416666666667
+                          +in[(i+0)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
                           +in[(i+1)*n+(j+0)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.010416666666666666
-                          +in[(i+1)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+1)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+2)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.004166666666666667
-                          +in[(i+2)*n+(j+-1)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+0)] * 0.010416666666666666
-                          +in[(i+2)*n+(j+1)] * 0.010416666666666666
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
+                          +in[(i+2)*n+(j+0)] * 0.0104166666667
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
                           +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.004166666666666667
-                          +in[(i+2)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+3)*n+(j+-4)] * -0.002232142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+-1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+0)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+1)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+2)] * 0.004166666666666667
-                          +in[(i+3)*n+(j+3)] * 0.020833333333333332
-                          +in[(i+3)*n+(j+4)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-3)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+0)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+1)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+2)] * 0.002232142857142857
-                          +in[(i+4)*n+(j+3)] * 0.002232142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+0)] * 0.00416666666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+0)] * 0.00223214285714
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
       }
@@ -308,114 +308,114 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.0011111111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.0011111111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111
+                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
                           +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+0)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+1)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+2)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+3)] * -0.0017857142857142857
-                          +in[(i+-4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-3)*n+(j+-3)] * -0.016666666666666666
-                          +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+0)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+1)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+2)] * -0.0033333333333333335
-                          +in[(i+-3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335
+                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
+                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
                           +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+1)] * -0.008333333333333333
-                          +in[(i+-2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+-1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+-1)*n+(j+-1)] * -0.05
                           +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+-1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+-1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+-1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+0)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+0)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
                           +in[(i+0)*n+(j+-1)] * -0.05
                           +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+0)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+0)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+0)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+1)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+1)*n+(j+-2)] * -0.008333333333333333
+                          +in[(i+0)*n+(j+2)] * 0.00833333333333
+                          +in[(i+0)*n+(j+3)] * 0.00333333333333
+                          +in[(i+0)*n+(j+4)] * 0.00178571428571
+                          +in[(i+0)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
                           +in[(i+1)*n+(j+0)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.008333333333333333
-                          +in[(i+1)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+1)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+1)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+2)*n+(j+-3)] * -0.0033333333333333335
-                          +in[(i+2)*n+(j+-1)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+0)] * 0.008333333333333333
-                          +in[(i+2)*n+(j+1)] * 0.008333333333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
+                          +in[(i+2)*n+(j+0)] * 0.00833333333333
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
                           +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.0033333333333333335
-                          +in[(i+2)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+2)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.0017857142857142857
-                          +in[(i+3)*n+(j+-2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+-1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+0)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+1)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+2)] * 0.0033333333333333335
-                          +in[(i+3)*n+(j+3)] * 0.016666666666666666
-                          +in[(i+3)*n+(j+4)] * 0.0017857142857142857
-                          +in[(i+3)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.0011111111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+-1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+0)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+1)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+2)] * 0.0017857142857142857
-                          +in[(i+4)*n+(j+3)] * 0.0017857142857142857
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+0)] * 0.00333333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+0)] * 0.00178571428571
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+0)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+1)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+2)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+3)] * 0.0011111111111111111
-                          +in[(i+5)*n+(j+4)] * 0.0011111111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+0)] * 0.00111111111111
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
       }
diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc
index 268d9e19a..aff072f53 100644
--- a/Cxx11/transpose-kokkos.cc
+++ b/Cxx11/transpose-kokkos.cc
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2018, Intel Corporation
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions
@@ -54,140 +54,136 @@
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11 Matrix transpose: B = A^T" << std::endl;
+  std::cout << "C++11/Kokkos Matrix transpose: B = A^T" << std::endl;
 
   Kokkos::initialize(argc, argv);
+  {
+    // row-major 2D array
+    typedef Kokkos::View<double**, Kokkos::LayoutRight> matrix;
+    // column-major 2D array
+    //typedef Kokkos::View<double**, Kokkos::LayoutLeft> matrix;
+    // default 2D array
+    //typedef Kokkos::View<double**> matrix;
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations;
+    int order;
+    int tile_size;
+    bool permute = false;
+    try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <matrix order> [<tile_size> <permute=0/1>]";
+        }
+
+        iterations  = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        order = std::atoi(argv[2]);
+        if (order <= 0) {
+          throw "ERROR: Matrix Order must be greater than 0";
+        } else if (order > std::floor(std::sqrt(INT_MAX))) {
+          throw "ERROR: matrix dimension too large - overflow risk";
+        }
+
+        // default tile size for tiling of local transpose
+        tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+        // a negative tile size means no tiling of the local transpose
+        if (tile_size <= 0) tile_size = order;
+
+        auto permute_input = (argc>4) ? std::atoi(argv[4]) : 0;
+        if (permute_input != 0 && permute_input != 1) {
+          throw "ERROR: permute must be 0 (no) or 1 (yes)";
+        }
+        permute = (permute_input == 1);
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
 
-  typedef Kokkos::TeamPolicy<>               team_policy ;
-  typedef Kokkos::TeamPolicy<>::member_type  member_type ;
-
-  // row-major 2D array
-  typedef Kokkos::View<double**, Kokkos::LayoutRight> matrix;
-  // column-major 2D array
-  //typedef Kokkos::View<double**, Kokkos::LayoutLeft> matrix;
-  // default 2D array
-  //typedef Kokkos::View<double**> matrix;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations;
-  int order;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <matrix order>";
-      }
-
-      // number of times to do the transpose
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
+    std::cout << "Number of iterations = " << iterations << std::endl;
+    std::cout << "Matrix order         = " << order << std::endl;
+    std::cout << "Tile size            = " << tile_size << std::endl;
+    std::cout << "Permute loops        = " << (permute ? "yes" : "no") << std::endl;
+    std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
 
-      // order of a the matrix
-      order = std::atoi(argv[2]);
-      if (order <= 0) {
-        throw "ERROR: Matrix Order must be greater than 0";
-      } else if (order > std::floor(std::sqrt(INT_MAX))) {
-        throw "ERROR: matrix dimension too large - overflow risk";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space and perform the computation
+    //////////////////////////////////////////////////////////////////////
 
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
-  std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+    matrix A("A", order, order);
+    matrix B("B", order, order);
 
-  //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
-  //////////////////////////////////////////////////////////////////////
+    auto order2 = {order,order};
+    auto tile2  = {tile_size,tile_size};
 
-  matrix A("A", order, order);
-  matrix B("B", order, order);
+    auto policy    = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0,0},order2,tile2);
+    typedef Kokkos::Rank<2,Kokkos::Iterate::Right,Kokkos::Iterate::Left > rl;
+    typedef Kokkos::Rank<2,Kokkos::Iterate::Left, Kokkos::Iterate::Right> lr;
+    auto policy_lr = Kokkos::MDRangePolicy<rl>({0,0},order2,tile2);
+    auto policy_rl = Kokkos::MDRangePolicy<lr>({0,0},order2,tile2);
 
-#if 0
-  Kokkos::parallel_for ( order, KOKKOS_LAMBDA(const int i) {
-    for (auto j=0; j<order; ++j){
+    Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i, int j) {
         A(i,j) = static_cast<double>(i*order+j);
         B(i,j) = 0.0;
-    }
-  });
-#else
-  Kokkos::parallel_for( team_policy(order, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) {
-    const int i = teamMember.league_rank();
-    Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, order), [&](const int j) {
-      A(i,j) = static_cast<double>(i*order+j);
-      B(i,j) = 0.0;
     });
-  });
-#endif
 
-  auto trans_time = 0.0;
+    double trans_time(0);
 
-  for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; ++iter) {
 
-    if (iter==1) trans_time = prk::wtime();
+      if (iter==1) trans_time = prk::wtime();
 
-#if 0
-    Kokkos::parallel_for ( order, KOKKOS_LAMBDA(const int i) {
-      for (auto j=0; j<order; ++j){
-        B(i,j) += A(j,i);
-        A(j,i) += 1.0;
+      if (permute) {
+          Kokkos::parallel_for(policy_rl, KOKKOS_LAMBDA(int i, int j) {
+              B(i,j) += A(j,i);
+              A(j,i) += 1.0;
+          });
+      } else {
+          Kokkos::parallel_for(policy_lr, KOKKOS_LAMBDA(int i, int j) {
+              B(i,j) += A(j,i);
+              A(j,i) += 1.0;
+          });
       }
-    });
-#else
-    Kokkos::parallel_for( team_policy(order, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) {
-      const int i = teamMember.league_rank();
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, order), [&](const int j) {
-        B(i,j) += A(j,i);
-        A(j,i) += 1.0;
-      });
-    });
-#endif
     }
 
-  trans_time = prk::wtime() - trans_time;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Analyze and output results
-  //////////////////////////////////////////////////////////////////////
-
-  const double addit = (iterations+1.) * (0.5*iterations);
-  double abserr(0);
-  Kokkos::parallel_reduce( team_policy(order, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type & teamMember, double & update) {
-    const int i = teamMember.league_rank();
-    double temp(0);
-    Kokkos::parallel_reduce( Kokkos::TeamThreadRange(teamMember, order), [&](const int j, double & inner) {
-      const size_t ij = i*order+j;
-      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
-      inner += std::fabs(B(j,i) - reference);
-    }, temp);
-    Kokkos::single( Kokkos::PerTeam( teamMember ), [&] () {
-        update += temp;
-    });
-  }, abserr);
+    trans_time = prk::wtime() - trans_time;
+
+    //////////////////////////////////////////////////////////////////////
+    /// Analyze and output results
+    //////////////////////////////////////////////////////////////////////
+
+    double const addit = (iterations+1.) * (0.5*iterations);
+    double abserr(0);
+    Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(int i, int j, double & update) {
+        size_t const ij = i*order+j;
+        double const reference = static_cast<double>(ij)*(1.+iterations)+addit;
+        update += std::fabs(B(j,i) - reference);
+    }, abserr);
 
 #ifdef VERBOSE
-  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+    std::cout << "Sum of absolute differences: " << abserr << std::endl;
 #endif
 
-  const auto epsilon = 1.0e-8;
-  if (abserr < epsilon) {
-    std::cout << "Solution validates" << std::endl;
-    auto avgtime = trans_time/iterations;
-    auto bytes = (size_t)order * (size_t)order * sizeof(double);
-    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
-              << " Avg time (s): " << avgtime << std::endl;
-  } else {
-    std::cout << "ERROR: Aggregate squared error " << abserr
-              << " exceeds threshold " << epsilon << std::endl;
-    return 1;
-  }
+    double epsilon(1.0e-8);
+    if (abserr < epsilon) {
+      std::cout << "Solution validates" << std::endl;
+      auto avgtime = trans_time/iterations;
+      auto bytes = (size_t)order * (size_t)order * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+    } else {
+      std::cout << "ERROR: Aggregate squared error " << abserr
+                << " exceeds threshold " << epsilon << std::endl;
+      return 1;
+    }
 
+  }
   Kokkos::finalize();
 
   return 0;

From 3dde8a27e10f554a1c56ccf197dcb4f66a1b4b49 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 25 Apr 2018 19:49:37 -0400
Subject: [PATCH 079/245] Sycl remove boost (#335)

* remove boost dependency from sycl nstream
* remove boost dependency from sycl transpose
* cleanup sycl like kokkos
* bug fix range replacement
* initialize simply to avoid issues
---
 Cxx11/Makefile          |  2 +-
 Cxx11/nstream-sycl.cc   | 24 ++++++++----------------
 Cxx11/stencil-sycl.cc   |  2 --
 Cxx11/transpose-sycl.cc | 26 +++++++++++---------------
 travis/install-raja.sh  |  3 ++-
 5 files changed, 22 insertions(+), 35 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 5eb4b1526..c4b9b6ba8 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -35,7 +35,7 @@ TARGETFLAGS = $(OFFLOADFLAG)
 OPENCLFLAGS = $(OPENCLFLAG)
 # We do not yet handle all possible exceptions...
 #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS
-SYCLFLAGS = $(SYCLFLAG) $(BOOSTFLAG)
+SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0
 ORNLACCFLAGS = $(ORNLACCFLAG)
 TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index b21c73593..c5d390341 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -111,21 +111,13 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  auto nstream_time = 0.0;
+  double nstream_time(0);
 
-  std::vector<double> h_A(length);
-  std::vector<double> h_B(length);
-  std::vector<double> h_C(length);
+  std::vector<double> h_A(length,0);
+  std::vector<double> h_B(length,2);
+  std::vector<double> h_C(length,2);
 
-  auto range = boost::irange(static_cast<size_t>(0), length);
-
-  const double scalar(3);
-
-  std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
-      h_A[i] = 0;
-      h_B[i] = 2;
-      h_C[i] = 2;
-  });
+  double const scalar(3);
 
   {
     // initialize device buffers from host buffers
@@ -133,7 +125,7 @@ int main(int argc, char * argv[])
     cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
     cl::sycl::buffer<double> d_C { h_C.data(), h_C.size() };
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; ++iter) {
 
       if (iter==1) nstream_time = prk::wtime();
 
@@ -164,14 +156,14 @@ int main(int argc, char * argv[])
   double ar(0);
   double br(2);
   double cr(2);
-  for (auto i=0; i<=iterations; i++) {
+  for (int i=0; i<=iterations; ++i) {
       ar += br + scalar * cr;
   }
 
   ar *= length;
 
   double asum(0);
-  for (size_t i=0; i<length; i++) {
+  for (size_t i=0; i<length; ++i) {
       asum += std::fabs(h_A[i]);
   }
 
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 7aceb02c0..9989b5bdc 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -60,8 +60,6 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#define USE_2D_INDEXING 1
-
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index e75897e77..5055374d2 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -51,8 +51,6 @@
 
 #include "prk_util.h"
 
-#define USE_2D_INDEXING 1
-
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
@@ -95,7 +93,7 @@ int main(int argc, char * argv[])
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
-  auto trans_time = 0.0;
+  double trans_time(0);
 
   std::vector<double> h_A(order*order);
   std::vector<double> h_B(order*order,0.0);
@@ -115,7 +113,7 @@ int main(int argc, char * argv[])
     cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
 #endif
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; ++iter) {
 
       if (iter==1) trans_time = prk::wtime();
 
@@ -151,16 +149,14 @@ int main(int argc, char * argv[])
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  auto range = boost::irange(static_cast<size_t>(0),order);
-
   // TODO: replace with std::generate, std::accumulate, or similar
-  const auto addit = (iterations+1.) * (iterations/2.);
-  auto abserr = 0.0;
-  for (auto i : range) {
-    for (auto j : range) {
-      const int ij = i*order+j;
-      const int ji = j*order+i;
-      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+  double const addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  for (size_t i=0; i<order; ++i) {
+    for (size_t j=0; j<order; ++j) {
+      size_t const ij = i*order+j;
+      size_t const ji = j*order+i;
+      double const reference = static_cast<double>(ij)*(1.+iterations)+addit;
       abserr += std::fabs(h_B[ji] - reference);
     }
   }
@@ -169,12 +165,12 @@ int main(int argc, char * argv[])
   std::cout << "Sum of absolute differences: " << abserr << std::endl;
 #endif
 
-  const auto epsilon = 1.0e-8;
+  double const epsilon(1.0e-8);
   if (abserr < epsilon) {
     std::cout << "Solution validates" << std::endl;
     auto avgtime = trans_time/iterations;
     auto bytes = (size_t)order * (size_t)order * sizeof(double);
-    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "ERROR: Aggregate squared error " << abserr
diff --git a/travis/install-raja.sh b/travis/install-raja.sh
index 114b9f2a5..fe633f5aa 100644
--- a/travis/install-raja.sh
+++ b/travis/install-raja.sh
@@ -40,7 +40,8 @@ esac
 ${PRK_CXX} -v
 
 if [ ! -d "$TRAVIS_ROOT/raja" ]; then
-    BRANCH=develop
+    #BRANCH=develop # forallN deprecated
+    BRANCH=master
     git clone --recursive --depth 1 -b ${BRANCH} https://github.com/LLNL/RAJA.git
     cd RAJA
     mkdir build

From bb3afd897e704be1277b988f5c4bf6e952c21af3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 25 Apr 2018 19:49:53 -0400
Subject: [PATCH 080/245] use RAJA master branch (#336)

- they have deprecated forallN in develop branch and i don't want to
  see the warnings about it
- somebody committed a "here 2" debug message to develop branch that
  leads to excessive output in Travis

From 9d5780cd224398f639013c4df9d9dacd71047480 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 27 Apr 2018 12:15:56 -0700
Subject: [PATCH 081/245] Use ranges ts (#337)

* add Travis installer for Ranges TS
* update makefile examples for ranges TS change
* hide difference between Boost and TS ranges with prk::range in prk_util.h
* finish ranges update:
- replace boost::irange with prk::range everywhere
- prk::range supports both contiguous and strided
---
 Cxx11/Makefile                      | 10 +++--
 Cxx11/generate-cxx-stencil.py       |  8 ++--
 Cxx11/nstream-vector-pstl.cc        |  2 +-
 Cxx11/nstream-vector-rangefor.cc    |  2 +-
 Cxx11/p2p-hyperplane-vector-pstl.cc |  4 +-
 Cxx11/prk_util.h                    | 37 ++++++++++++++--
 Cxx11/stencil-vector-pstl.cc        |  4 +-
 Cxx11/stencil-vector-rangefor.cc    |  4 +-
 Cxx11/stencil_pgnu.hpp              | 20 ++++-----
 Cxx11/stencil_pstl.hpp              | 20 ++++-----
 Cxx11/stencil_rangefor.hpp          | 20 ++++-----
 Cxx11/stencil_stl.hpp               | 20 ++++-----
 Cxx11/transpose-vector-pstl.cc      |  2 +-
 Cxx11/transpose-vector-rangefor.cc  | 16 +++----
 common/make.defs.gcc                | 20 +++++----
 common/make.defs.intel              | 14 ++++---
 common/make.defs.llvm               | 13 +++---
 travis/build-run-prk.sh             | 65 ++++++++++++++---------------
 travis/install-deps.sh              |  1 +
 travis/install-ranges.sh            |  8 ++++
 20 files changed, 169 insertions(+), 121 deletions(-)
 create mode 100644 travis/install-ranges.sh

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index c4b9b6ba8..3a26ead0b 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -28,7 +28,8 @@ ifdef USE_PRK_KOKKOS_BACKEND
     KOKKOS_BACKEND_FLAG = -DPRK_KOKKOS_BACKEND=$(USE_PRK_KOKKOS_BACKEND)
 endif
 
-ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm
+#ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm
+ASMFLAGS = -fverbose-asm
 
 OMPFLAGS = $(OPENMPFLAG)
 TARGETFLAGS = $(OFFLOADFLAG)
@@ -40,8 +41,9 @@ ORNLACCFLAGS = $(ORNLACCFLAG)
 TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
 BOOSTFLAGS = $(BOOSTFLAG)
-STLFLAGS = $(STLFLAG) $(BOOSTFLAGS)
-PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS)
+RANGEFLAGS = -DUSE_RANGES $(RANGEFLAG)
+STLFLAGS = $(STLFLAG) $(RANGEFLAGS)
+PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS)
 RAJAFLAGS = $(RAJAFLAG)
 KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG)
 ORNLACCFLAGS = $(ORNLACCFLAG)
@@ -159,7 +161,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(PSTLFLAGS) -o $@
 
 %-rangefor: %-rangefor.cc prk_util.h
-	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) $< $(RANGEFLAGS) -o $@
 
 %-boost-compute: %-boost-compute.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -DUSE_BOOST_COMPUTE $(OPENCLFLAGS) -o $@
diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index 39e66459a..b3b573887 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -29,24 +29,24 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('      for (auto j='+str(radius)+'; j<n-'+str(radius)+'; ++j) {\n')
     elif (model=='rangefor'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
-        src.write('    auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n')
+        src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    for (auto i : inside) {\n')
         src.write('      PRAGMA_SIMD\n')
         src.write('      for (auto j : inside) {\n')
     elif (model=='stl'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
-        src.write('    auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n')
+        src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    std::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n')
         #src.write('      PRAGMA_SIMD\n')
         src.write('      std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n')
     elif (model=='pgnu'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
-        src.write('    auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n')
+        src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n')
         src.write('      std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n')
     elif (model=='pstl'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
-        src.write('    auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n')
+        src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {\n')
         src.write('      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n')
     elif (model=='raja'):
diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index dfe053ebf..dbc52aaf4 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -120,7 +120,7 @@ int main(int argc, char * argv[])
   std::vector<double> B(length);
   std::vector<double> C(length);
 
-  auto range = boost::irange(static_cast<size_t>(0), length);
+  auto range = prk::range(static_cast<size_t>(0), length);
 
   double scalar(3);
 
diff --git a/Cxx11/nstream-vector-rangefor.cc b/Cxx11/nstream-vector-rangefor.cc
index 54bad9274..2bdadea3d 100644
--- a/Cxx11/nstream-vector-rangefor.cc
+++ b/Cxx11/nstream-vector-rangefor.cc
@@ -116,7 +116,7 @@ int main(int argc, char * argv[])
   std::vector<double> B(length,2.0);
   std::vector<double> C(length,2.0);
 
-  auto range = boost::irange(static_cast<size_t>(0), length);
+  auto range = prk::range(0,length);
 
   double scalar(3);
 
diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
index 81b58d50c..91b0392f0 100644
--- a/Cxx11/p2p-hyperplane-vector-pstl.cc
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -161,7 +161,7 @@ int main(int argc, char* argv[])
       for (auto i=2; i<=2*n-2; i++) {
         const auto begin = std::max(2,i-n+2);
         const auto end   = std::min(i,n)+1;
-        auto range = boost::irange(begin,end);
+        auto range = prk::range(begin,end);
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
         std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
@@ -179,7 +179,7 @@ int main(int argc, char* argv[])
       for (int i=2; i<=2*(nb+1)-2; i++) {
         const auto begin = std::max(2,i-(nb+1)+2);
         const auto end   = std::min(i,nb+1)+1;
-        auto range = boost::irange(begin,end);
+        auto range = prk::range(begin,end);
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
         std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 8bb718fe0..e1576e3a1 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2018, Intel Corporation
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions
@@ -180,8 +180,16 @@ const T prk_reduce(I first, I last, T init) {
 # endif
 #endif
 
-#if defined(USE_BOOST)
-# include "boost/range/irange.hpp"
+#if defined(USE_RANGES)
+# if defined(USE_BOOST_IRANGE)
+#  include "boost/range/irange.hpp"
+# elif defined(USE_RANGES_TS)
+#  include "range/v3/view/iota.hpp"
+#  include "range/v3/view/slice.hpp"
+#  include "range/v3/view/stride.hpp"
+# else
+#  error You have not provided a version of ranges to use.
+# endif
 #endif
 
 #if defined(USE_BOOST_COMPUTE)
@@ -248,6 +256,29 @@ namespace prk {
         return ( numerator / denominator + (numerator % denominator > 0) );
     }
 
+    template <class S, class E>
+    auto range(S start, E end) {
+#if defined(USE_BOOST_IRANGE)
+        return boost::irange(static_cast<decltype(end)>(start), end);
+#elif defined(USE_RANGES_TS)
+        return ranges::view::iota(static_cast<decltype(end)>(start), end);
+#endif
+    }
+
+    template <class S, class E, class B>
+    auto range(S start, E end, B blocking) {
+#if defined(USE_BOOST_IRANGE)
+        return boost::irange(static_cast<decltype(end)>(start), end, decltype(end)>(blocking) );
+#elif defined(USE_RANGES_TS)
+        // NOTE:
+        // iota(s) | slice(s,e) | stride(b)  is faster than
+        // iota(s,e) | stride(b) for some reason.
+        return ranges::view::iota(static_cast<decltype(end)>(start)) |
+               ranges::view::slice(static_cast<decltype(end)>(start), end) |
+               ranges::view::stride(static_cast<decltype(end)>(blocking));
+#endif
+    }
+
 } // namespace prk
 
 #endif /* PRK_UTIL_H */
diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc
index 863a50df5..8495032ca 100644
--- a/Cxx11/stencil-vector-pstl.cc
+++ b/Cxx11/stencil-vector-pstl.cc
@@ -180,7 +180,7 @@ int main(int argc, char* argv[])
   std::vector<double> out(n*n);
 
   // initialize the input and output arrays
-  auto range = boost::irange(0,n);
+  auto range = prk::range(0,n);
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
   std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) {
     std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) {
@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
 
   // compute L1 norm in parallel
   double norm = 0.0;
-  auto inside = boost::irange(radius,n-radius);
+  auto inside = prk::range(radius,n-radius);
   for (auto i : inside) {
     for (auto j : inside) {
       norm += std::fabs(out[i*n+j]);
diff --git a/Cxx11/stencil-vector-rangefor.cc b/Cxx11/stencil-vector-rangefor.cc
index aef3a3880..040bde745 100644
--- a/Cxx11/stencil-vector-rangefor.cc
+++ b/Cxx11/stencil-vector-rangefor.cc
@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
   std::vector<double> out(n*n);
 
   // initialize the input and output arrays
-  auto range = boost::irange(0,n);
+  auto range = prk::range(0,n);
   for (auto i : range) {
     for (auto j : range) {
       in[i*n+j] = static_cast<double>(i+j);
@@ -200,7 +200,7 @@ int main(int argc, char* argv[])
 
   // compute L1 norm in parallel
   double norm = 0.0;
-  auto inside = boost::irange(radius,n-radius);
+  auto inside = prk::range(radius,n-radius);
   for (auto i : inside) {
     for (auto j : inside) {
       norm += std::fabs(out[i*n+j]);
diff --git a/Cxx11/stencil_pgnu.hpp b/Cxx11/stencil_pgnu.hpp
index c1236b120..d6c1ee3eb 100644
--- a/Cxx11/stencil_pgnu.hpp
+++ b/Cxx11/stencil_pgnu.hpp
@@ -1,5 +1,5 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
@@ -11,7 +11,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
@@ -27,7 +27,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
@@ -47,7 +47,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
@@ -71,7 +71,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
@@ -99,7 +99,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
@@ -114,7 +114,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
@@ -143,7 +143,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
@@ -194,7 +194,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
@@ -275,7 +275,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp
index 70ad6cf66..8713da4d8 100644
--- a/Cxx11/stencil_pstl.hpp
+++ b/Cxx11/stencil_pstl.hpp
@@ -1,5 +1,5 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
@@ -11,7 +11,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
@@ -27,7 +27,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
@@ -47,7 +47,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
@@ -71,7 +71,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
@@ -99,7 +99,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
@@ -114,7 +114,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
@@ -143,7 +143,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
@@ -194,7 +194,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
@@ -275,7 +275,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp
index b6bf57581..c85964181 100644
--- a/Cxx11/stencil_rangefor.hpp
+++ b/Cxx11/stencil_rangefor.hpp
@@ -1,5 +1,5 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -14,7 +14,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -33,7 +33,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -56,7 +56,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -83,7 +83,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -114,7 +114,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -132,7 +132,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -164,7 +164,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -218,7 +218,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
@@ -302,7 +302,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
diff --git a/Cxx11/stencil_stl.hpp b/Cxx11/stencil_stl.hpp
index ecde3e1ce..4dcdde467 100644
--- a/Cxx11/stencil_stl.hpp
+++ b/Cxx11/stencil_stl.hpp
@@ -1,5 +1,5 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
@@ -11,7 +11,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
@@ -27,7 +27,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
@@ -47,7 +47,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
@@ -71,7 +71,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
@@ -99,7 +99,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(1,n-1);
+    auto inside = prk::range(1,n-1);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
@@ -114,7 +114,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(2,n-2);
+    auto inside = prk::range(2,n-2);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
@@ -143,7 +143,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(3,n-3);
+    auto inside = prk::range(3,n-3);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
@@ -194,7 +194,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(4,n-4);
+    auto inside = prk::range(4,n-4);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
@@ -275,7 +275,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    auto inside = boost::irange(5,n-5);
+    auto inside = prk::range(5,n-5);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc
index 8b9734200..222322bd8 100644
--- a/Cxx11/transpose-vector-pstl.cc
+++ b/Cxx11/transpose-vector-pstl.cc
@@ -105,7 +105,7 @@ int main(int argc, char * argv[])
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
-  auto range = boost::irange(0,order);
+  auto range = prk::range(0,order);
 
   auto trans_time = 0.0;
 
diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc
index e02047a6d..3d2e4f9f1 100644
--- a/Cxx11/transpose-vector-rangefor.cc
+++ b/Cxx11/transpose-vector-rangefor.cc
@@ -109,17 +109,17 @@ int main(int argc, char * argv[])
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);
 
-  auto itrange = boost::irange(0,order,tile_size);
-  auto jtrange = boost::irange(0,order,tile_size);
+  auto itrange = prk::range(0,order,tile_size);
+  auto jtrange = prk::range(0,order,tile_size);
 
   for (auto iter = 0; iter<=iterations; iter++) {
 
     if (iter==1) trans_time = prk::wtime();
 
     for (auto it : itrange) {
-      auto irange = boost::irange(it,std::min(order,it+tile_size));
+      auto irange = prk::range(it,std::min(order,it+tile_size));
       for (auto jt : jtrange) {
-        auto jrange = boost::irange(jt,std::min(order,jt+tile_size));
+        auto jrange = prk::range(jt,std::min(order,jt+tile_size));
         for (auto i : irange) {
           for (auto j : jrange) {
             B[i*order+j] += A[j*order+i];
@@ -136,10 +136,10 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////
 
   // TODO: replace with std::generate, std::accumulate, or similar
-  const auto addit = (iterations+1.) * (iterations/2.);
-  auto abserr = 0.0;
-  auto irange = boost::irange(0,order);
-  auto jrange = boost::irange(0,order);
+  auto const addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  auto irange = prk::range(0,order);
+  auto jrange = prk::range(0,order);
   for (auto i : irange) {
     for (auto j : jrange) {
       const int ij = i*order+j;
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 3b3e6413a..d1a56e5bd 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -46,14 +46,14 @@ OPENCLFLAG=-framework OpenCL
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-#SYCLDIR=./triSYCL
-#SYCLCXX=${CXX} ${OPENMPFLAG}
-#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
+SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
-SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
-SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 #
@@ -65,13 +65,15 @@ CILKFLAG=-fcilkplus
 #
 # TBB
 #
-TBBDIR=/usr/local/Cellar/tbb/2018_U2
+TBBDIR=/usr/local/Cellar/tbb/2018_U3_1
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include
+BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 7ecd87ead..0dea4bb44 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -45,12 +45,12 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
 SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+SYCLFLAG=-DUSE_SYCL -std=gnu++14 -I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
-SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
-SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 #
@@ -66,8 +66,10 @@ TBBFLAG=-DUSE_TBB -tbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG}
+BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/intel
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/intel
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 0ae50e78b..cdbd6510a 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -75,8 +75,8 @@ SYCLFLAG+=-std=c++14
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
-SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+SYCLCXX=${CXX} -std=gnu++14 ${OPENMPFLAG}
+SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS)
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -84,17 +84,20 @@ SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
 #SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
+#
 OCCADIR=${HOME}/prk-repo/Cxx11/occa
 #
 # TBB
 #
-TBBDIR=/usr/local/Cellar/tbb/2018_U3
+TBBDIR=/usr/local/Cellar/tbb/2018_U3_1
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include
+BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/clang
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/clang
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 73883df11..6c1e1a1ca 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -18,7 +18,7 @@ case "$os" in
         export MPI_ROOT=/usr/local
         ;;
     Linux)
-        export MPI_ROOT=$TRAVIS_ROOT
+        export MPI_ROOT=${TRAVIS_ROOT}
         ;;
 esac
 
@@ -68,7 +68,7 @@ case "$PRK_TARGET" in
                 export JULIA_PATH=/usr/local/bin/
                 ;;
             Linux)
-                export JULIA_PATH=$TRAVIS_ROOT/julia/bin/
+                export JULIA_PATH=${TRAVIS_ROOT}/julia/bin/
                 ;;
         esac
         ${JULIA_PATH}julia --version
@@ -442,11 +442,10 @@ case "$PRK_TARGET" in
         esac
 
         # Boost.Compute found after OpenCL, and only available in Travis with MacOS.
-        if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
-            echo "BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE" >> common/make.defs
-        else
-            echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs
-        fi
+        echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs
+
+        #echo "RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}" >> common/make.defs
+        echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs
 
         # C++11 with rangefor and Boost.Ranges
         make -C $PRK_TARGET_PATH rangefor
@@ -511,9 +510,9 @@ case "$PRK_TARGET" in
         if [ "${CC}" = "gcc" ] || [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
             if [ "${CC}" = "clang" ] ; then
                 # omp.h not found with clang-3.9 - just work around instead of fixing.
-                echo "PSTLFLAG=-DUSE_PSTL ${TBBFLAG} -DUSE_INTEL_PSTL -I$TRAVIS_ROOT/pstl/include" >> common/make.defs
+                echo "PSTLFLAG=-DUSE_PSTL ${TBBFLAG} -DUSE_INTEL_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs
             else
-                echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I$TRAVIS_ROOT/pstl/include" >> common/make.defs
+                echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs
             fi
             make -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl
             $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl    10 1024 1
@@ -666,7 +665,7 @@ case "$PRK_TARGET" in
                     # Homebrew installs a symlink in /usr/local/bin
                     export PRK_CAFC=/usr/local/bin/caf
                 elif [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
-                    export PRK_CAFC=$TRAVIS_ROOT/opencoarrays/bin/caf
+                    export PRK_CAFC=${TRAVIS_ROOT}/opencoarrays/bin/caf
                 fi
                 echo "CAFC=$PRK_CAFC -std=f2008 -cpp" >> common/make.defs
                 echo "COARRAYFLAG=-fcoarray=single" >> common/make.defs
@@ -745,7 +744,7 @@ case "$PRK_TARGET" in
                     export PRK_OVERSUBSCRIBE="--oversubscribe"
                     export TMPDIR=/tmp
                 elif [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
-                    export PRK_LAUNCHER=$TRAVIS_ROOT/opencoarrays/bin/cafrun
+                    export PRK_LAUNCHER=${TRAVIS_ROOT}/opencoarrays/bin/cafrun
                 fi
                 $PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-} $PRK_TARGET_PATH/p2p-coarray       10 1024 1024
                 $PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-} $PRK_TARGET_PATH/stencil-coarray   10 1000
@@ -901,13 +900,13 @@ case "$PRK_TARGET" in
     allshmem)
         echo "SHMEM"
         # This should be fixed by rpath (https://github.com/regrant/sandia-shmem/issues/83)
-        export LD_LIBRARY_PATH=$TRAVIS_ROOT/sandia-openshmem/lib:$TRAVIS_ROOT/libfabric/lib:$LD_LIBRARY_PATH
-        export SHMEM_ROOT=$TRAVIS_ROOT/sandia-openshmem
+        export LD_LIBRARY_PATH=${TRAVIS_ROOT}/sandia-openshmem/lib:${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH
+        export SHMEM_ROOT=${TRAVIS_ROOT}/sandia-openshmem
         echo "SHMEMTOP=$SHMEM_ROOT\nSHMEMCC=$SHMEM_ROOT/bin/oshcc" >> common/make.defs
         make $PRK_TARGET
         export PRK_TARGET_PATH=SHMEM
         export PRK_SHMEM_PROCS=4
-        export OSHRUN_LAUNCHER=$TRAVIS_ROOT/hydra/bin/mpirun
+        export OSHRUN_LAUNCHER=${TRAVIS_ROOT}/hydra/bin/mpirun
         export PRK_LAUNCHER=$SHMEM_ROOT/bin/oshrun
         $PRK_LAUNCHER -n $PRK_SHMEM_PROCS $PRK_TARGET_PATH/Synch_p2p/p2p       10 1024 1024
         $PRK_LAUNCHER -n $PRK_SHMEM_PROCS $PRK_TARGET_PATH/Stencil/stencil     10 1000
@@ -921,14 +920,14 @@ case "$PRK_TARGET" in
                 case "$CC" in
                     gcc)
                         # If building from source (impossible)
-                        #export UPC_ROOT=$TRAVIS_ROOT/gupc
+                        #export UPC_ROOT=${TRAVIS_ROOT}/gupc
                         # If installing deb file
-                        export UPC_ROOT=$TRAVIS_ROOT/gupc/usr/local/gupc
+                        export UPC_ROOT=${TRAVIS_ROOT}/gupc/usr/local/gupc
                         ;;
                     clang)
                         echo "Clang UPC is not supported."
                         exit 9
-                        export UPC_ROOT=$TRAVIS_ROOT/clupc
+                        export UPC_ROOT=${TRAVIS_ROOT}/clupc
                         ;;
                 esac
                 echo "UPCC=$UPC_ROOT/bin/upc" >> common/make.defs
@@ -937,7 +936,7 @@ case "$PRK_TARGET" in
                 make $PRK_TARGET
                 ;;
             bupc)
-                export UPC_ROOT=$TRAVIS_ROOT/bupc-$CC
+                export UPC_ROOT=${TRAVIS_ROOT}/bupc-$CC
                 echo "UPCC=$UPC_ROOT/bin/upcc" >> common/make.defs
                 # -N $nodes -n UPC threads -c $cores_per_node
                 # -localhost is only for UDP
@@ -947,7 +946,7 @@ case "$PRK_TARGET" in
                         ;;
                     ofi)
                         export GASNET_SSH_SERVERS="localhost"
-                        export LD_LIBRARY_PATH="$TRAVIS_ROOT/libfabric/lib:$LD_LIBRARY_PATH"
+                        export LD_LIBRARY_PATH="${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH"
                         export PRK_LAUNCHER="$UPC_ROOT/bin/upcrun -v -N 1 -n $PRK_UPC_PROCS -c $PRK_UPC_PROCS"
                         ;;
                     mpi)
@@ -978,12 +977,12 @@ case "$PRK_TARGET" in
         os=`uname`
         case "$os" in
             Darwin)
-                export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-darwin-x86_64-smp
+                export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-darwin-x86_64-smp
                 ;;
             Linux)
-                #export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64
-                export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64-smp
-                #export CHARM_ROOT=$TRAVIS_ROOT/charm/multicore-linux64
+                #export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64
+                export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64-smp
+                #export CHARM_ROOT=${TRAVIS_ROOT}/charm/multicore-linux64
                 ;;
         esac
         echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs
@@ -1002,12 +1001,12 @@ case "$PRK_TARGET" in
         os=`uname`
         case "$os" in
             Darwin)
-                export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-darwin-x86_64-smp
+                export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-darwin-x86_64-smp
                 ;;
             Linux)
-                #export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64
-                export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64-smp
-                #export CHARM_ROOT=$TRAVIS_ROOT/charm/multicore-linux64
+                #export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64
+                export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64-smp
+                #export CHARM_ROOT=${TRAVIS_ROOT}/charm/multicore-linux64
                 ;;
         esac
         echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs
@@ -1036,7 +1035,7 @@ case "$PRK_TARGET" in
         ;;
     allfgmpi)
         echo "Fine-Grain MPI (FG-MPI)"
-        export FGMPI_ROOT=$TRAVIS_ROOT/fgmpi
+        export FGMPI_ROOT=${TRAVIS_ROOT}/fgmpi
         echo "FGMPITOP=$FGMPI_ROOT\nFGMPICC=$FGMPI_ROOT/bin/mpicc -std=c99" >> common/make.defs
         make $PRK_TARGET
         export PRK_TARGET_PATH=FG_MPI
@@ -1060,11 +1059,11 @@ case "$PRK_TARGET" in
     allgrappa)
         echo "Grappa"
         ########################
-        #. $TRAVIS_ROOT/grappa/bin/settings.sh
-        export GRAPPA_PREFIX=$TRAVIS_ROOT/grappa
-        export SCRIPT_PATH=$TRAVIS_ROOT/grappa/bin
+        #. ${TRAVIS_ROOT}/grappa/bin/settings.sh
+        export GRAPPA_PREFIX=${TRAVIS_ROOT}/grappa
+        export SCRIPT_PATH=${TRAVIS_ROOT}/grappa/bin
         ########################
-        echo "GRAPPATOP=$TRAVIS_ROOT/grappa" >> common/make.defs
+        echo "GRAPPATOP=${TRAVIS_ROOT}/grappa" >> common/make.defs
         make $PRK_TARGET
         export PRK_TARGET_PATH=GRAPPA
         export PRK_MPI_PROCS=2
@@ -1087,7 +1086,7 @@ case "$PRK_TARGET" in
         ;;
     alllegion)
         echo "Legion"
-        echo "LEGIONTOP=$TRAVIS_ROOT/legion" > common/make.defs
+        echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs
         make $PRK_TARGET -k
         ;;
 esac
diff --git a/travis/install-deps.sh b/travis/install-deps.sh
index 3917e2fec..a82df34cc 100755
--- a/travis/install-deps.sh
+++ b/travis/install-deps.sh
@@ -61,6 +61,7 @@ case "$PRK_TARGET" in
         fi
         sh ./travis/install-tbb.sh $TRAVIS_ROOT
         sh ./travis/install-pstl.sh $TRAVIS_ROOT
+        sh ./travis/install-ranges.sh $TRAVIS_ROOT
         # Boost is whitelisted and obtained from package manager
         if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
             sh ./travis/install-boost.sh $TRAVIS_ROOT
diff --git a/travis/install-ranges.sh b/travis/install-ranges.sh
new file mode 100644
index 000000000..fda3e48aa
--- /dev/null
+++ b/travis/install-ranges.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+set -e
+set -x
+
+TRAVIS_ROOT="$1"
+
+git clone --depth 1 https://github.com/ericniebler/range-v3.git $TRAVIS_ROOT/range-v3

From e2e2c94d591ec9533bec5a4ccfd8ddc92abd794d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 1 May 2018 11:51:16 -0700
Subject: [PATCH 082/245] make ranges TS the default for GCC and Clang. (#339)

ICC 18.0.1 bug report has been filed for inability to compile the ranges
TS implementation.  we leave Boost as the default there.

fixed a bug identied by ICC in prk_util.h
---
 Cxx11/prk_util.h       | 2 +-
 common/make.defs.gcc   | 4 ++--
 common/make.defs.intel | 4 ++--
 common/make.defs.llvm  | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index e1576e3a1..868f8b8c9 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -268,7 +268,7 @@ namespace prk {
     template <class S, class E, class B>
     auto range(S start, E end, B blocking) {
 #if defined(USE_BOOST_IRANGE)
-        return boost::irange(static_cast<decltype(end)>(start), end, decltype(end)>(blocking) );
+        return boost::irange(static_cast<decltype(end)>(start), end, static_cast<decltype(end)>(blocking) );
 #elif defined(USE_RANGES_TS)
         // NOTE:
         // iota(s) | slice(s,e) | stride(b)  is faster than
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index d1a56e5bd..5477603e0 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -71,8 +71,8 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 # Parallel STL, Boost, etc.
 #
 BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
-RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
-#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 0dea4bb44..a4d515ba0 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -66,9 +66,9 @@ TBBFLAG=-DUSE_TBB -tbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+#BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
-#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/intel
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index cdbd6510a..481a624ea 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -95,8 +95,8 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 # Parallel STL, Boost, etc.
 #
 BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
-RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
-#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/clang
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl

From b123d147652d056f83f3ffbcadb4244fc0ac6e73 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 1 May 2018 11:53:02 -0700
Subject: [PATCH 083/245] p2p openmp tasks improved (#338)

* improved p2p w/ OpenMP Tasks (C, C++, Fortran)

instead of an N:1 and 1:N dependency on grid[0], just do taskwait at the
end of the wavefront sweep.  also remove the unnecessary diagonal
dependency, which is implied by the horizontal and vertical
dependencies.

Author: Vishakha Agrawal of Intel

* interchange loops to match Fortran order
* add restrict to sweep_tile in C and C++
* add restrict pointer to grid.data() in C++
* replace auto with int in C++
---
 C1z/p2p-simd-openmp.c        |  2 +-
 C1z/p2p-sse.c                |  2 +-
 C1z/p2p-tasks-openmp.c       | 10 +++-------
 C1z/p2p.c                    |  2 +-
 Cxx11/p2p-tasks-openmp.cc    | 30 +++++++++++++-----------------
 Cxx11/p2p-vector.cc          | 30 ++++++++++++++++--------------
 FORTRAN/p2p-tasks-openmp.f90 | 15 +++++----------
 7 files changed, 40 insertions(+), 51 deletions(-)

diff --git a/C1z/p2p-simd-openmp.c b/C1z/p2p-simd-openmp.c
index 6a02b96a1..a9444d02f 100644
--- a/C1z/p2p-simd-openmp.c
+++ b/C1z/p2p-simd-openmp.c
@@ -63,7 +63,7 @@
 
 static inline void sweep_tile(int startm, int endm,
                               int startn, int endn,
-                              int n, double grid[])
+                              int n, double grid[restrict])
 {
   for (int i=startm; i<endm; i++) {
     OMP_SIMD
diff --git a/C1z/p2p-sse.c b/C1z/p2p-sse.c
index dcde7c67c..a257d1f09 100644
--- a/C1z/p2p-sse.c
+++ b/C1z/p2p-sse.c
@@ -74,7 +74,7 @@ void print_m128d(const char * label, __m128d r)
 
 static inline void sweep_tile(int startm, int endm,
                               int startn, int endn,
-                              int n, double grid[])
+                              int n, double grid[restrict])
 {
   for (int i=startm; i<endm; i++) {
     for (int j=startn; j<endn; j++) {
diff --git a/C1z/p2p-tasks-openmp.c b/C1z/p2p-tasks-openmp.c
index 55b32595d..6218d827a 100644
--- a/C1z/p2p-tasks-openmp.c
+++ b/C1z/p2p-tasks-openmp.c
@@ -63,7 +63,7 @@
 
 static inline void sweep_tile(int startm, int endm,
                               int startn, int endn,
-                              int n, double grid[])
+                              int n, double grid[restrict])
 {
   for (int i=startm; i<endm; i++) {
     for (int j=startn; j<endn; j++) {
@@ -134,9 +134,6 @@ int main(int argc, char * argv[])
   OMP_PARALLEL()
   OMP_MASTER
   {
-    int lic = (m/mc-1) * mc + 1;
-    int ljc = (n/nc-1) * nc + 1;
-
     OMP_TASKLOOP( firstprivate(n) shared(grid) )
     for (int i=0; i<m; i++) {
       for (int j=0; j<n; j++) {
@@ -158,14 +155,13 @@ int main(int argc, char * argv[])
 
       for (int i=1; i<m; i+=mc) {
         for (int j=1; j<n; j+=nc) {
-          OMP_TASK( depend(in:grid[0],grid[(i-mc)*n+j],grid[i*n+(j-nc)],grid[(i-mc)*n+(j-nc)]) depend(out:grid[i*n+j]) )
+          OMP_TASK( depend(in:grid[(i-mc)*n+j],grid[i*n+(j-nc)]) depend(out:grid[i*n+j]) )
           sweep_tile(i, MIN(m,i+mc), j, MIN(n,j+nc), n, grid);
         }
       }
-      OMP_TASK( depend(in:grid[(lic-1)*n+(ljc)]) depend(out:grid[0]) )
+      OMP_TASKWAIT
       grid[0*n+0] = -grid[(m-1)*n+(n-1)];
     }
-    OMP_TASKWAIT
     pipeline_time = prk_wtime() - pipeline_time;
   }
 
diff --git a/C1z/p2p.c b/C1z/p2p.c
index 410cf1df7..db21f57a8 100644
--- a/C1z/p2p.c
+++ b/C1z/p2p.c
@@ -63,7 +63,7 @@
 
 static inline void sweep_tile(int startm, int endm,
                               int startn, int endn,
-                              int n, double grid[])
+                              int n, double grid[restrict])
 {
   for (int i=startm; i<endm; i++) {
     for (int j=startn; j<endn; j++) {
diff --git a/Cxx11/p2p-tasks-openmp.cc b/Cxx11/p2p-tasks-openmp.cc
index 7db36cf1b..46ce6e334 100644
--- a/Cxx11/p2p-tasks-openmp.cc
+++ b/Cxx11/p2p-tasks-openmp.cc
@@ -63,10 +63,10 @@
 
 inline void sweep_tile(int startm, int endm,
                        int startn, int endn,
-                       int n, double grid[])
+                       int n, double * RESTRICT grid)
 {
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
+  for (int i=startm; i<endm; i++) {
+    for (int j=startn; j<endn; j++) {
       grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
     }
   }
@@ -135,43 +135,39 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  double * grid = new double[m*n];
+  double * RESTRICT grid = new double[m*n];
 
   OMP_PARALLEL()
   OMP_MASTER
   {
-    int lic = (m/mc-1) * mc + 1;
-    int ljc = (n/nc-1) * nc + 1;
-
     OMP_TASKLOOP( firstprivate(m,n) shared(grid) )
-    for (auto i=0; i<m; i++) {
-      for (auto j=0; j<n; j++) {
+    for (int i=0; i<m; i++) {
+      for (int j=0; j<n; j++) {
         grid[i*n+j] = 0.0;
       }
     }
     OMP_TASKWAIT
 
-    for (auto j=0; j<n; j++) {
+    for (int j=0; j<n; j++) {
       grid[0*n+j] = static_cast<double>(j);
     }
-    for (auto i=0; i<m; i++) {
+    for (int i=0; i<m; i++) {
       grid[i*n+0] = static_cast<double>(i);
     }
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) pipeline_time = prk::wtime();
 
-      for (auto i=1; i<m; i+=mc) {
-        for (auto j=1; j<n; j+=nc) {
-          OMP_TASK( firstprivate(m,n) shared(grid) depend(in:grid[0],grid[(i-mc)*n+j],grid[i*n+(j-nc)],grid[(i-mc)*n+(j-nc)]) depend(out:grid[i*n+j]) )
+      for (int i=1; i<m; i+=mc) {
+        for (int j=1; j<n; j+=nc) {
+          OMP_TASK( firstprivate(m,n) shared(grid) depend(in:grid[(i-mc)*n+j],grid[i*n+(j-nc)]) depend(out:grid[i*n+j]) )
           sweep_tile(i, std::min(m,i+mc), j, std::min(n,j+nc), n, grid);
         }
       }
-      OMP_TASK( firstprivate(m,n) shared(grid) depend(in:grid[(lic-1)*n+(ljc)]) depend(out:grid[0]) )
+      OMP_TASKWAIT
       grid[0*n+0] = -grid[(m-1)*n+(n-1)];
     }
-    OMP_TASKWAIT
     pipeline_time = prk::wtime() - pipeline_time;
   }
 
diff --git a/Cxx11/p2p-vector.cc b/Cxx11/p2p-vector.cc
index de7337980..da4041642 100644
--- a/Cxx11/p2p-vector.cc
+++ b/Cxx11/p2p-vector.cc
@@ -63,10 +63,10 @@
 
 inline void sweep_tile(int startm, int endm,
                        int startn, int endn,
-                       int n, std::vector<double> & grid)
+                       int n, double * RESTRICT grid)
 {
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
+  for (int i=startm; i<endm; i++) {
+    for (int j=startn; j<endn; j++) {
       grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
     }
   }
@@ -132,31 +132,33 @@ int main(int argc, char* argv[])
 
   {
     // set boundary values (bottom and left side of grid)
-    for (auto j=0; j<n; j++) {
+    for (int j=0; j<n; j++) {
       grid[0*n+j] = static_cast<double>(j);
     }
-    for (auto i=0; i<m; i++) {
+    for (int i=0; i<m; i++) {
       grid[i*n+0] = static_cast<double>(i);
     }
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) pipeline_time = prk::wtime();
 
+      double * RESTRICT pgrid = grid.data();
+
       if (mc==m && nc==n) {
-        for (auto i=1; i<m; i++) {
-          for (auto j=1; j<n; j++) {
-            grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+        for (int i=1; i<m; i++) {
+          for (int j=1; j<n; j++) {
+            pgrid[i*n+j] = pgrid[(i-1)*n+j] + pgrid[i*n+(j-1)] - pgrid[(i-1)*n+(j-1)];
           }
         }
-      } else /* chunking */ {
-        for (auto i=1; i<m; i+=mc) {
-          for (auto j=1; j<n; j+=nc) {
-            sweep_tile(i, std::min(m,i+mc), j, std::min(n,j+nc), n, grid);
+      } else {
+        for (int i=1; i<m; i+=mc) {
+          for (int j=1; j<n; j+=nc) {
+            sweep_tile(i, std::min(m,i+mc), j, std::min(n,j+nc), n, pgrid);
           }
         }
       }
-      grid[0*n+0] = -grid[(m-1)*n+(n-1)];
+      pgrid[0*n+0] = -pgrid[(m-1)*n+(n-1)];
     }
     pipeline_time = prk::wtime() - pipeline_time;
   }
diff --git a/FORTRAN/p2p-tasks-openmp.f90 b/FORTRAN/p2p-tasks-openmp.f90
index 1a0601b6f..74c7dcd90 100644
--- a/FORTRAN/p2p-tasks-openmp.f90
+++ b/FORTRAN/p2p-tasks-openmp.f90
@@ -184,24 +184,19 @@ program main
 
     if (k.eq.1) t0 = omp_get_wtime()
 
-    do ic=2,m,mc
-      do jc=2,n,nc
-        !$omp task firstprivate(i,j,jc,mc,nc,m,n) shared(grid)                          &
-        !$omp&     depend(in:grid(1,1),grid(ic-mc,jc-nc),grid(ic-mc,jc),grid(ic,jc-nc)) &
+    do jc=2,n,nc
+      do ic=2,m,mc
+        !$omp task firstprivate(i,j,jc,mc,nc,m,n) shared(grid)  &
+        !$omp&     depend(in:grid(ic-mc,jc),grid(ic,jc-nc))     &
         !$omp&     depend(out:grid(ic,jc))
         call sweep_tile(ic,min(m,ic+mc-1),jc,min(n,jc+nc-1),m,n,grid)
         !$omp end task
       enddo
     enddo
-    !$omp task firstprivate(m,n) shared(grid)                 &
-    !$omp&     depend(in:grid(lic,ljc)) depend(out:grid(1,1))
+    !$omp taskwait
     grid(1,1) = -grid(m,n)
-    !$omp end task
-
   enddo
 
-  !$omp taskwait
-
   t1 = omp_get_wtime()
   pipeline_time = t1 - t0
 

From 5259f7ee18f6a746ef8f75a8cc9b41b3efa7ceb2 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo@reble.org>
Date: Thu, 3 May 2018 19:05:41 -0500
Subject: [PATCH 084/245] TBB Flow Graph version of p2p (#340)

* Add TBB Flow Graph version of p2p Pipeline kernel

* add target to Makefile

* Add TBB flags for Flow Graph Analyzer tracing support

* replace block_node_body with lambda expression
---
 Cxx11/Makefile         |   4 +-
 Cxx11/p2p-tasks-tbb.cc | 271 +++++++++++++++++++++++++++++++++++++++++
 common/make.defs.intel |   1 +
 3 files changed, 274 insertions(+), 2 deletions(-)
 create mode 100644 Cxx11/p2p-tasks-tbb.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 3a26ead0b..484a232b4 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -69,7 +69,7 @@ all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy
 
 p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
-     p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc
+     p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb
 
 stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
@@ -102,7 +102,7 @@ opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \
-     p2p-hyperplane-vector-tbb
+     p2p-hyperplane-vector-tbb p2p-tasks-tbb
 
 stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl
 
diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc
new file mode 100644
index 000000000..3142a1e42
--- /dev/null
+++ b/Cxx11/p2p-tasks-tbb.cc
@@ -0,0 +1,271 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an m*n grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <m> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///            TBB implementation by Pablo Reble, April 2018.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+#include "tbb/flow_graph.h"
+#include "tbb/parallel_for.h"
+
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, double grid[])
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/TBB Flow Graph pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  using namespace tbb::flow;
+  //graph g;
+
+  int iterations;
+  int m, n;
+  int mc, nc;
+  try {
+      if (argc < 4){
+        throw " <# iterations> <first array dimension> <second array dimension> [<first chunk dimension> <second chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      m = std::atoi(argv[2]);
+      n = std::atoi(argv[3]);
+      if (m < 1 || n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(m)*static_cast<size_t>(n) > INT_MAX) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      mc = (argc > 4) ? std::atoi(argv[4]) : m;
+      nc = (argc > 5) ? std::atoi(argv[5]) : n;
+      if (mc < 1 || mc > m || nc < 1 || nc > n) {
+        std::cout << "WARNING: grid chunk dimensions invalid: " << mc <<  nc << " (ignoring)" << std::endl;
+        mc = m;
+        nc = n;
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  const char* envvar = std::getenv("TBB_NUM_THREADS");
+  int num_threads = (envvar!=NULL) ? std::atoi(envvar) : tbb::task_scheduler_init::default_num_threads();
+  tbb::task_scheduler_init init(num_threads);
+
+  std::cout << "Number of threads    = " << num_threads << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << m << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << mc << ", " << nc << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Create Grid and allocate space
+  //////////////////////////////////////////////////////////////////////
+  // calculate number of tiles in n and m direction to create grid.
+  int num_blocks_n = (n / nc);
+  if(n%nc != 0) num_blocks_n++;
+  int num_blocks_m = (m / mc);
+  if(m%mc != 0) num_blocks_m++;
+
+  auto pipeline_time = 0.0; // silence compiler warning
+
+  double * grid = new double[m*n];
+
+  typedef tbb::flow::continue_node< tbb::flow::continue_msg > block_node_t;
+
+  graph g;
+  block_node_t *nodes[ num_blocks_n * num_blocks_m ];
+  // To enable tracing support for Flow Graph Analyzer
+  // set following MACRO and link against TBB preview library (-ltbb_preview)
+#if TBB_PREVIEW_FLOW_GRAPH_TRACE
+  char buffer[1024];
+  g.set_name("Pipeline");
+#endif
+
+  bool first_iter=true;
+  block_node_t b(g, [&](const tbb::flow::continue_msg &){
+    grid[0*n+0] = -grid[(m-1)*n+(n-1)];
+    if(first_iter) pipeline_time = prk::wtime();
+      first_iter = false;
+  });
+  for (int i=0; i<num_blocks_m; i+=1) {
+    for (int j=0; j<num_blocks_n; j+=1) {
+        block_node_t *tmp = new block_node_t(g, [=](const tbb::flow::continue_msg &){
+            sweep_tile((i*mc)+1, std::min(m,(i*mc)+mc+1), (j*nc)+1, std::min(n,(j*nc)+nc+1), n, grid);
+        });
+#if TBB_PREVIEW_FLOW_GRAPH_TRACE
+        sprintf(buffer, "block [ %d, %d ]", i, j );
+        tmp->set_name( buffer );
+#endif
+        nodes[i*num_blocks_n + j] = tmp;
+        if (i>0)
+          make_edge(*nodes[(i-1)*num_blocks_n + j ], *tmp );
+        if (j>0)
+          make_edge(*nodes[ i   *num_blocks_n + j-1], *tmp );
+        // Transitive dependencies from OpenMP task version:
+        //make_edge( *tmp, b );
+        //if (i>0 && j>0)
+        //  make_edge(*nodes[(i-1)*num_blocks_n + j-1], *tmp );
+    }
+  }
+  auto start = true;
+  source_node<continue_msg> s(g, [&](continue_msg &v) -> bool {
+    if(start) { 
+      v = continue_msg();
+      start = false;
+      return true;
+    }
+    return false;
+  }, false);
+  
+  limiter_node<continue_msg> l(g, iterations+1, 1);
+
+  make_edge( s, l );
+  make_edge( l, *nodes[0] );
+  make_edge( *nodes[(num_blocks_n * num_blocks_m) - 1], b);
+  make_edge( b, l );
+
+#if TBB_PREVIEW_FLOW_GRAPH_TRACE
+  s.set_name("Source");
+  b.set_name("Iteration Barrier");
+  l.set_name("Limiter");
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  // Perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  {
+
+    tbb::blocked_range2d<int> range(0, m, mc, 0, n, nc);
+    tbb::parallel_for( range, [&](decltype(range)& r) {
+      for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
+        for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
+          grid[i*n+j] = 0.0;
+        }
+      }
+    }, tbb_partitioner);
+    for (auto j=0; j<n; j++) {
+      grid[0*n+j] = static_cast<double>(j);
+    }
+    for (auto i=0; i<m; i++) {
+      grid[i*n+0] = static_cast<double>(i);
+    }
+
+    s.activate();
+    g.wait_for_all();
+    
+    pipeline_time = prk::wtime() - pipeline_time;
+
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Cleanup Flow Graph
+  //////////////////////////////////////////////////////////////////////
+
+  for (int i=0; i<num_blocks_m; i+=1) {
+    for (int j=0; j<num_blocks_n; j+=1) {
+      delete nodes[i*num_blocks_n + j];
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(n+m-2.));
+  if ( (std::fabs(grid[(m-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)]
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/common/make.defs.intel b/common/make.defs.intel
index a4d515ba0..556d940b6 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -63,6 +63,7 @@ CILKFLAG=-intel-extensions # default
 # TBB
 #
 TBBFLAG=-DUSE_TBB -tbb
+#TBBFLAG=-DUSE_TBB -tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE
 #
 # Parallel STL, Boost, etc.
 #

From 59b80af2fe981f1d094768eda45d3db2b8b6a6dc Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 3 May 2018 17:06:20 -0700
Subject: [PATCH 085/245] TBB flowgraph (#341)

* Add TBB Flow Graph version of p2p Pipeline kernel

* add target to Makefile

* Add TBB flags for Flow Graph Analyzer tracing support

* replace block_node_body with lambda expression

* add Travis for TBB tasks (flowgraph)
---
 travis/build-run-prk.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 6c1e1a1ca..350bfb353 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -480,6 +480,7 @@ case "$PRK_TARGET" in
             $PRK_TARGET_PATH/p2p-innerloop-vector-tbb     10 1024
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 1
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 32
+            $PRK_TARGET_PATH/p2p-tasks-tbb                10 1024 1024 32 32
             $PRK_TARGET_PATH/stencil-vector-tbb           10 1000
             $PRK_TARGET_PATH/transpose-vector-tbb         10 1024 32
             $PRK_TARGET_PATH/nstream-vector-tbb           10 16777216 32

From eea8494ffafe56c6cbf000c285a5b09b1d419443 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 10 May 2018 15:04:17 -0700
Subject: [PATCH 086/245] build p2p-tasks-tbb

---
 travis/build-run-prk.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 350bfb353..00b4395e6 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -476,7 +476,7 @@ case "$PRK_TARGET" in
                     export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH}
                     ;;
             esac
-            make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
+            make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb p2p-tasks-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
             $PRK_TARGET_PATH/p2p-innerloop-vector-tbb     10 1024
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 1
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 32

From f09025bb86003f178f7a6213d6cde99634072a73 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 10 May 2018 15:52:18 -0700
Subject: [PATCH 087/245] Thrust nstream (#342)

* Thrust transpose working with host vector, device still busted
* Thrust nstream working with host and device
---
 .gitignore                       |   4 +
 Cxx11/Makefile                   |  19 +++-
 Cxx11/nstream-device-thrust.cu   | 179 +++++++++++++++++++++++++++++
 Cxx11/nstream-host-thrust.cc     | 179 +++++++++++++++++++++++++++++
 Cxx11/prk_util.h                 |  15 +++
 Cxx11/transpose-device-thrust.cu | 188 +++++++++++++++++++++++++++++++
 Cxx11/transpose-host-thrust.cc   | 156 +++++++++++++++++++++++++
 common/make.defs.gcc             |   8 +-
 8 files changed, 743 insertions(+), 5 deletions(-)
 create mode 100644 Cxx11/nstream-device-thrust.cu
 create mode 100644 Cxx11/nstream-host-thrust.cc
 create mode 100644 Cxx11/transpose-device-thrust.cu
 create mode 100644 Cxx11/transpose-host-thrust.cc

diff --git a/.gitignore b/.gitignore
index d5100141d..6cd0e154b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -179,7 +179,11 @@ Cxx11/transpose-vector-raja
 Cxx11/transpose-vector-rangefor
 Cxx11/transpose-vector-tbb
 Cxx11/transpose-vector-taskloop
+Cxx11/transpose-vector-async
+Cxx11/transpose-vector-thread
 Cxx11/transpose-kokkos
+Cxx11/transpose-device-thrust
+Cxx11/transpose-host-thrust
 Cxx11/transpose-cublas
 Cxx11/transpose-cuda
 Cxx11/grid1.cl
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 484a232b4..07be4fa0b 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -45,7 +45,8 @@ RANGEFLAGS = -DUSE_RANGES $(RANGEFLAG)
 STLFLAGS = $(STLFLAG) $(RANGEFLAGS)
 PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS)
 RAJAFLAGS = $(RAJAFLAG)
-KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG)
+THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST
+KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS)
 ORNLACCFLAGS = $(ORNLACCFLAG)
 
 ifdef OCCADIR
@@ -54,7 +55,7 @@ endif
 OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca
 
 .PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \
-	rangefor kokkos raja cuda cublas sycl boost-compute
+	rangefor kokkos raja cuda cublas sycl boost-compute thrust
 
 EXTRA=
 ifeq ($(shell uname -s),Darwin)
@@ -116,6 +117,11 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r
 
 cuda: stencil-cuda transpose-cuda nstream-cuda
 
+thrust: nstream-host-thrust nstream-device-thrust \
+        transpose-host-thrust transpose-device-thrust
+
+cuda: transpose-cuda
+
 cublas: transpose-cublas nstream-cublas dgemm-cublas
 
 occa: transpose-occa nstream-occa
@@ -173,6 +179,14 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP)
 	$(CXX) $(CXXFLAGS) $< $(KOKKOSFLAGS) -o $@
 
+# for host execution
+%-thrust: %-thrust.cc prk_util.h
+	$(CXX) $(CXXFLAGS) $< $(THRUSTFLAGS) -o $@
+
+# for device execution (must compiler as .cu)
+%-thrust: %-thrust.cu prk_util.h
+	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< $(THRUSTFLAGS) -o $@
+
 %-cuda: %-cuda.cu prk_util.h prk_cuda.h
 	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@
 
@@ -218,6 +232,7 @@ clean:
 	-rm -f *-rangefor
 	-rm -f *-raja
 	-rm -f *-kokkos
+	-rm -f *-thrust
 	-rm -f *-cuda
 	-rm -f *-cublas
 	-rm -f *-cblas
diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu
new file mode 100644
index 000000000..7ea49dabd
--- /dev/null
+++ b/Cxx11/nstream-device-thrust.cu
@@ -0,0 +1,179 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/Thrust STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  thrust::device_vector<double> A(length);
+  thrust::device_vector<double> B(length);
+  thrust::device_vector<double> C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
+
+  double scalar(3);
+  {
+    thrust::fill(thrust::device, A.begin(), A.end(), 0.0);
+    thrust::fill(thrust::device, B.begin(), B.end(), 2.0);
+    thrust::fill(thrust::device, C.begin(), C.end(), 2.0);
+
+    auto nstream = [=] __host__ __device__ (thrust::tuple<double&,double,double> t) {
+        thrust::get<0>(t) +=  thrust::get<1>(t) + scalar * thrust::get<2>(t);
+    };
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      thrust::for_each( thrust::device,
+                        thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+                        thrust::make_zip_iterator(thrust::make_tuple(A.end()  , B.end()  , C.end())),
+                        nstream);
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  //double asum = thrust::reduce(A.begin(), A.end(), 0.0, thrust::plus<double>());
+  double asum = thrust::transform_reduce(A.begin(),
+                                         A.end(),
+                                         [=] __host__ __device__ (double x) -> double { return fabs(x); },
+                                         0.0,
+                                         thrust::plus<double>());
+
+  double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc
new file mode 100644
index 000000000..5bc29d145
--- /dev/null
+++ b/Cxx11/nstream-host-thrust.cc
@@ -0,0 +1,179 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/Thrust STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  thrust::host_vector<double> A(length);
+  thrust::host_vector<double> B(length);
+  thrust::host_vector<double> C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
+
+  double scalar(3);
+  {
+    thrust::fill(thrust::host, A.begin(), A.end(), 0.0);
+    thrust::fill(thrust::host, B.begin(), B.end(), 2.0);
+    thrust::fill(thrust::host, C.begin(), C.end(), 2.0);
+
+    auto nstream = [=] __host__ __device__ (thrust::tuple<double&,double,double> t) {
+        thrust::get<0>(t) +=  thrust::get<1>(t) + scalar * thrust::get<2>(t);
+    };
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      thrust::for_each( thrust::host,
+                        thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+                        thrust::make_zip_iterator(thrust::make_tuple(A.end()  , B.end()  , C.end())),
+                        nstream);
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  //double asum = thrust::reduce(A.begin(), A.end(), 0.0, thrust::plus<double>());
+  double asum = thrust::transform_reduce(A.begin(),
+                                         A.end(),
+                                         [=] __host__ __device__ (double x) -> double { return std::fabs(x); },
+                                         0.0,
+                                         thrust::plus<double>());
+
+  double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 868f8b8c9..76363d8e2 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -225,6 +225,21 @@ const T prk_reduce(I first, I last, T init) {
 # include "RAJA/RAJA.hpp"
 #endif
 
+#ifdef USE_THRUST
+# ifdef __NVCC__
+#  include <thrust/device_vector.h>
+# endif
+# include <thrust/host_vector.h>
+# include <thrust/fill.h>
+# include <thrust/sequence.h>
+# include <thrust/for_each.h>
+# include <thrust/transform.h>
+# include <thrust/transform_reduce.h>
+# include <thrust/iterator/counting_iterator.h>
+# include <thrust/execution_policy.h>
+# include <thrust/functional.h>
+#endif
+
 #ifdef USE_SYCL
 # include "CL/sycl.hpp"
 #endif
diff --git a/Cxx11/transpose-device-thrust.cu b/Cxx11/transpose-device-thrust.cu
new file mode 100644
index 000000000..b4c9a1874
--- /dev/null
+++ b/Cxx11/transpose-device-thrust.cu
@@ -0,0 +1,188 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+struct x : public thrust::unary_function<void,int>
+{
+    int i;
+    int order;
+    thrust::device_vector<double> & A;
+    thrust::device_vector<double> & B;
+
+    x(int i, int order, thrust::device_vector<double> & A, thrust::device_vector<double> & B) :
+        i(i), order(order), A(A), B(B) {}
+
+    __host__ __device__
+    void operator()(int j)
+    {
+        B[i*order+j] += A[j*order+i];
+        A[j*order+i] += 1.0;
+        return;
+    }
+};
+
+//__device__
+void transpose(const int order, thrust::device_vector<double> & A, thrust::device_vector<double> & B)
+{
+    thrust::counting_iterator<int> start(0);
+    thrust::counting_iterator<int> end = start + order;
+    thrust::for_each( thrust::device, start, end, [=,&A,&B] (int i) {
+      thrust::for_each( thrust::device, start, end, x(i,order,A,B) );
+    });
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/Thrust Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  thrust::device_vector<double> A(order*order);
+  thrust::device_vector<double> B(order*order);
+  // fill A with the sequence 0 to order^2-1 as doubles
+  thrust::sequence(thrust::device, A.begin(), A.end() );
+  thrust::fill(thrust::device, B.begin(), B.end(), 0.0);
+
+  auto range = boost::irange(0,order);
+
+  auto trans_time = 0.0;
+
+  for (auto iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) trans_time = prk::wtime();
+
+#if 1
+    transpose(order, A, B);
+#else
+    thrust::for_each( std::begin(range), std::end(range), [=,&A,&B] (int i) {
+      thrust::for_each( std::begin(range), std::end(range), [=,&A,&B] (int j) {
+        B[i*order+j] += A[j*order+i];
+        A[j*order+i] += 1.0;
+      });
+    });
+#endif
+  }
+  trans_time = prk::wtime() - trans_time;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // TODO: replace with std::generate, std::accumulate, or similar
+  const auto addit = (iterations+1.) * (iterations/2.);
+  auto abserr = 0.0;
+  for (auto i : range) {
+    for (auto j : range) {
+      const int ij = i*order+j;
+      const int ji = j*order+i;
+      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose-host-thrust.cc b/Cxx11/transpose-host-thrust.cc
new file mode 100644
index 000000000..53066208b
--- /dev/null
+++ b/Cxx11/transpose-host-thrust.cc
@@ -0,0 +1,156 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/Thrust Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  thrust::host_vector<double> A(order*order);
+  thrust::host_vector<double> B(order*order);
+  // fill A with the sequence 0 to order^2-1 as doubles
+  thrust::sequence(thrust::host, A.begin(), A.end() );
+  thrust::fill(thrust::host, B.begin(), B.end(), 0.0);
+
+  auto range = boost::irange(0,order);
+
+  auto trans_time = 0.0;
+
+  for (auto iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) trans_time = prk::wtime();
+
+    // transpose
+    thrust::for_each( thrust::host, std::begin(range), std::end(range), [&] (int i) {
+      thrust::for_each( thrust::host, std::begin(range), std::end(range), [&] (int j) {
+        B[i*order+j] += A[j*order+i];
+        A[j*order+i] += 1.0;
+      });
+    });
+  }
+  trans_time = prk::wtime() - trans_time;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // TODO: replace with std::generate, std::accumulate, or similar
+  const auto addit = (iterations+1.) * (iterations/2.);
+  auto abserr = 0.0;
+  for (auto i : range) {
+    for (auto j : range) {
+      const int ij = i*order+j;
+      const int ji = j*order+i;
+      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 5477603e0..ad1f2fcf2 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -78,6 +78,8 @@ KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
 RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust
+THRUSTFLAG=-DUSE_THRUST -I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 #
 # CBLAS for C++ DGEMM
 #
@@ -86,10 +88,10 @@ CBLASFLAG=-DACCELERATE -framework Accelerate
 # CUDA flags
 #
 # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
-NVCC=/opt/llvm/cocl/bin/cocl
+#NVCC=/opt/llvm/cocl/bin/cocl
 # Linux w/ NVIDIA CUDA
-#NVCC=nvcc
-CUDAFLAGS=-g -O3 -std=c++11
+NVCC=nvcc
+CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
 #

From 79637063cfac0123c05853a232dd64ae811848a1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 11 May 2018 22:06:24 -0700
Subject: [PATCH 088/245] nstream needs to use size_t loop index

---
 Cxx11/nstream-vector-pstl.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index dbc52aaf4..852db5de5 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -126,9 +126,10 @@ int main(int argc, char * argv[])
 
   {
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-    std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) {
+    std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
+#warning GNU parallel
     __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) {
 #else
     std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
@@ -143,7 +144,7 @@ int main(int argc, char * argv[])
       if (iter==1) nstream_time = prk::wtime();
 
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-      std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) {
+      std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
       __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) {

From bcb65326ea1b590acd6b97880898c626d3dbcf88 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 11 May 2018 22:12:55 -0700
Subject: [PATCH 089/245] GCC 8 released [ci skip]

---
 common/make.defs.gcc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index ad1f2fcf2..586cec08c 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -4,7 +4,7 @@
 #
 # Base compilers and language options
 #
-VERSION=-7
+VERSION=-8
 # C99 is required in some implementations.
 CC=gcc${VERSION} -std=c11 -pthread
 #EXTRA_CLIBS=-lrt
@@ -73,6 +73,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+#PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG}
 PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}

From 292029c2f419283f392dd45c4b9b5fdc9954ca28 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 11 May 2018 22:13:31 -0700
Subject: [PATCH 090/245] update C++ support matrix in README [ci skip]

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 871ea9272..03fedd850 100644
--- a/README.md
+++ b/README.md
@@ -92,12 +92,14 @@ f = see footnotes
 | SYCL                 |     |    y    |     y     |    y    |        |       |
 | Boost.Compute        |     |         |           |    y    |        |       |
 | Parallel STL         |  y  |    y    |     y     |    y    |        |       |
-| TBB                  |  i  |    y    |     y     |    y    |        |       |
+| Thrust               |  y  |         |           |         |        |       |
+| TBB                  |  y  |    y    |     y     |    y    |        |       |
 | Kokkos               |  y  |    y    |     y     |    y    |        |       |
 | RAJA                 |  y  |    y    |     y     |    y    |        |       |
 | CUDA                 |  i  |    y    |     y     |    y    |        |       |
-| CUBLAS               |     |         |     y     |    y    |        |       |
+| CUBLAS               |     |         |     y     |    y    |        |   y   |
 | CBLAS                |     |         |           |         |        |   y   |
+| OpenACC              |  y  |         |           |         |        |       |
 
 * [SYCL](http://sycl.tech/)
 * [Boost.Compute](http://boostorg.github.io/compute/)

From c3fd2463e02c97c93651fcb854ff1d06c5301bb8 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 11 May 2018 22:31:14 -0700
Subject: [PATCH 091/245] p2p scalar references (#343)

* apply scalar optimizations to p2p

Intel Fortran turns array references into scalars.  Intel C/C++ will do
it for 2D arrays but that's tricky here without VLAs.  The optimization
can be performed by hand, which is what this commit is.

Thanks to Martyn Corden of Intel for analyzing this issue and providing
the improved implementation.

* extra p2p kernel into seperate tile
* need static for inline to link
---
 C1z/p2p-kernel.h                       | 33 ++++++++++++++
 C1z/p2p-simd-openmp.c                  | 14 +-----
 C1z/p2p-tasks-openmp.c                 | 12 +----
 C1z/p2p.c                              | 12 +----
 Cxx11/p2p-hyperplane-sycl.cc           | 27 +----------
 Cxx11/p2p-hyperplane-vector-openmp.cc  | 29 +-----------
 Cxx11/p2p-hyperplane-vector-ornlacc.cc |  3 +-
 Cxx11/p2p-hyperplane-vector-pstl.cc    | 29 +-----------
 Cxx11/p2p-hyperplane-vector-tbb.cc     | 29 +-----------
 Cxx11/p2p-kernel.h                     | 63 ++++++++++++++++++++++++++
 Cxx11/p2p-tasks-openmp.cc              | 12 +----
 Cxx11/p2p-tasks-tbb.cc                 | 12 +----
 Cxx11/p2p-vector.cc                    | 20 ++++----
 13 files changed, 118 insertions(+), 177 deletions(-)
 create mode 100644 C1z/p2p-kernel.h
 create mode 100644 Cxx11/p2p-kernel.h

diff --git a/C1z/p2p-kernel.h b/C1z/p2p-kernel.h
new file mode 100644
index 000000000..ef2ea082a
--- /dev/null
+++ b/C1z/p2p-kernel.h
@@ -0,0 +1,33 @@
+#if 1
+
+static inline void sweep_tile(int startm, int endm,
+                              int startn, int endn,
+                              int n, double * restrict grid)
+{
+  for (int i=startm; i<endm; i++) {
+    for (int j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+#else
+
+static inline void sweep_tile(int startm, int endm,
+                              int startn, int endn,
+                              int n, double * restrict grid)
+{
+    for (int i=startm; i<endm; i++) {
+        double olda = grid[  i  *n+(startn-1)];
+        double oldb = grid[(i-1)*n+(startn-1)];
+        for (int j=startn; j<endn; j++) {
+            const double newb = grid[(i-1)*n+j];
+            const double newa = newb - oldb + olda;
+            grid[i*n+j] = newa;
+            olda = newa;
+            oldb = newb;
+        }
+    }
+}
+
+#endif
diff --git a/C1z/p2p-simd-openmp.c b/C1z/p2p-simd-openmp.c
index a9444d02f..7b98240ad 100644
--- a/C1z/p2p-simd-openmp.c
+++ b/C1z/p2p-simd-openmp.c
@@ -60,19 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-static inline void sweep_tile(int startm, int endm,
-                              int startn, int endn,
-                              int n, double grid[restrict])
-{
-  for (int i=startm; i<endm; i++) {
-    OMP_SIMD
-    for (int j=startn; j<endn; j++) {
-      OMP(ordered simd)
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
+#include "p2p-kernel.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/C1z/p2p-tasks-openmp.c b/C1z/p2p-tasks-openmp.c
index 6218d827a..8db9efdf2 100644
--- a/C1z/p2p-tasks-openmp.c
+++ b/C1z/p2p-tasks-openmp.c
@@ -60,17 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-static inline void sweep_tile(int startm, int endm,
-                              int startn, int endn,
-                              int n, double grid[restrict])
-{
-  for (int i=startm; i<endm; i++) {
-    for (int j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
+#include "p2p-kernel.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/C1z/p2p.c b/C1z/p2p.c
index db21f57a8..894d8cd1f 100644
--- a/C1z/p2p.c
+++ b/C1z/p2p.c
@@ -60,17 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-static inline void sweep_tile(int startm, int endm,
-                              int startn, int endn,
-                              int n, double grid[restrict])
-{
-  for (int i=startm; i<endm; i++) {
-    for (int j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
+#include "p2p-kernel.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc
index 305f5de38..68e7fc712 100644
--- a/Cxx11/p2p-hyperplane-sycl.cc
+++ b/Cxx11/p2p-hyperplane-sycl.cc
@@ -60,32 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-inline void sweep_tile_sequential(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, std::vector<double> & grid)
-{
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
-
-#if 0
-inline void sweep_tile_hyperplane(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, std::vector<double> & grid)
-{
-  for (auto i=2; i<=2*n-2; i++) {
-    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
-      const auto x = i-j+1;
-      const auto y = j-1;
-      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
-    }
-  }
-}
-#endif
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/Cxx11/p2p-hyperplane-vector-openmp.cc b/Cxx11/p2p-hyperplane-vector-openmp.cc
index 4a3f317ae..6bd35bcfe 100644
--- a/Cxx11/p2p-hyperplane-vector-openmp.cc
+++ b/Cxx11/p2p-hyperplane-vector-openmp.cc
@@ -60,32 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-inline void sweep_tile_sequential(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, double grid[])
-{
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
-
-#if 0
-inline void sweep_tile_hyperplane(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, double grid[])
-{
-  for (auto i=2; i<=2*n-2; i++) {
-    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
-      const auto x = i-j+1;
-      const auto y = j-1;
-      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
-    }
-  }
-}
-#endif
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
@@ -198,7 +173,7 @@ int main(int argc, char* argv[])
           for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) {
             const int ib = nc*(i-j+1-1)+1;
             const int jb = nc*(j-1-1)+1;
-            sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+            sweep_tile(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
           }
         }
       }
diff --git a/Cxx11/p2p-hyperplane-vector-ornlacc.cc b/Cxx11/p2p-hyperplane-vector-ornlacc.cc
index 05aac1ced..932cc1d89 100644
--- a/Cxx11/p2p-hyperplane-vector-ornlacc.cc
+++ b/Cxx11/p2p-hyperplane-vector-ornlacc.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
@@ -153,7 +154,7 @@ int main(int argc, char* argv[])
           for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) {
             const int ib = nc*(i-j)+1;
             const int jb = nc*(j-2)+1;
-            //sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+            //sweep_tile(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
             #pragma acc loop vector
             for (int i=ib; i<std::min(n,ib+nc); i++) {
               for (int j=jb; j<std::min(n,jb+nc); j++) {
diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
index 91b0392f0..3e9030f33 100644
--- a/Cxx11/p2p-hyperplane-vector-pstl.cc
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -60,32 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-inline void sweep_tile_sequential(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, std::vector<double> & grid)
-{
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
-
-#if 0
-inline void sweep_tile_hyperplane(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, std::vector<double> & grid)
-{
-  for (auto i=2; i<=2*n-2; i++) {
-    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
-      const auto x = i-j+1;
-      const auto y = j-1;
-      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
-    }
-  }
-}
-#endif
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
@@ -190,7 +165,7 @@ int main(int argc, char* argv[])
 #endif
           const int ib = nc*(i-j)+1;
           const int jb = nc*(j-2)+1;
-          sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+          sweep_tile(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
         });
       }
     }
diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc
index 6c4ad9aac..863580a0c 100644
--- a/Cxx11/p2p-hyperplane-vector-tbb.cc
+++ b/Cxx11/p2p-hyperplane-vector-tbb.cc
@@ -60,32 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-inline void sweep_tile_sequential(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, std::vector<double> & grid)
-{
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
-
-#if 0
-inline void sweep_tile_hyperplane(int startm, int endm,
-                                  int startn, int endn,
-                                  int n, std::vector<double> & grid)
-{
-  for (auto i=2; i<=2*n-2; i++) {
-    for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
-      const auto x = i-j+1;
-      const auto y = j-1;
-      grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
-    }
-  }
-}
-#endif
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
@@ -176,7 +151,7 @@ int main(int argc, char* argv[])
         tbb::parallel_for( std::max(2,i-(nb+1)+2), std::min(i,nb+1)+1, [=,&grid](int j) {
           const int ib = nc*(i-j)+1;
           const int jb = nc*(j-2)+1;
-          sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
+          sweep_tile(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid);
         });
       }
     }
diff --git a/Cxx11/p2p-kernel.h b/Cxx11/p2p-kernel.h
new file mode 100644
index 000000000..f402eba37
--- /dev/null
+++ b/Cxx11/p2p-kernel.h
@@ -0,0 +1,63 @@
+#define RESTRICT __restrict__
+
+#if 1
+
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, double * RESTRICT grid)
+{
+  for (int i=startm; i<endm; i++) {
+    for (int j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, std::vector<double> & grid)
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+#else
+
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, double * RESTRICT grid)
+{
+    for (int i=startm; i<endm; i++) {
+        double olda = grid[  i  *n+(startn-1)];
+        double oldb = grid[(i-1)*n+(startn-1)];
+        for (int j=startn; j<endn; j++) {
+            double const newb = grid[(i-1)*n+j];
+            double const newa = newb - oldb + olda;
+            grid[i*n+j] = newa;
+            olda = newa;
+            oldb = newb;
+        }
+    }
+}
+
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, std::vector<double> & grid)
+{
+    for (int i=startm; i<endm; i++) {
+        double olda = grid[  i  *n+(startn-1)];
+        double oldb = grid[(i-1)*n+(startn-1)];
+        for (int j=startn; j<endn; j++) {
+            double const newb = grid[(i-1)*n+j];
+            double const newa = newb - oldb + olda;
+            grid[i*n+j] = newa;
+            olda = newa;
+            oldb = newb;
+        }
+    }
+}
+
+#endif
diff --git a/Cxx11/p2p-tasks-openmp.cc b/Cxx11/p2p-tasks-openmp.cc
index 46ce6e334..a1da7f5fa 100644
--- a/Cxx11/p2p-tasks-openmp.cc
+++ b/Cxx11/p2p-tasks-openmp.cc
@@ -60,17 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-inline void sweep_tile(int startm, int endm,
-                       int startn, int endn,
-                       int n, double * RESTRICT grid)
-{
-  for (int i=startm; i<endm; i++) {
-    for (int j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc
index 3142a1e42..c19a55d44 100644
--- a/Cxx11/p2p-tasks-tbb.cc
+++ b/Cxx11/p2p-tasks-tbb.cc
@@ -61,21 +61,11 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "p2p-kernel.h"
 
 #include "tbb/flow_graph.h"
 #include "tbb/parallel_for.h"
 
-inline void sweep_tile(int startm, int endm,
-                       int startn, int endn,
-                       int n, double grid[])
-{
-  for (auto i=startm; i<endm; i++) {
-    for (auto j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
-
 int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
diff --git a/Cxx11/p2p-vector.cc b/Cxx11/p2p-vector.cc
index da4041642..26a39cf6b 100644
--- a/Cxx11/p2p-vector.cc
+++ b/Cxx11/p2p-vector.cc
@@ -60,17 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-
-inline void sweep_tile(int startm, int endm,
-                       int startn, int endn,
-                       int n, double * RESTRICT grid)
-{
-  for (int i=startm; i<endm; i++) {
-    for (int j=startn; j<endn; j++) {
-      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-    }
-  }
-}
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
@@ -147,8 +137,14 @@ int main(int argc, char* argv[])
 
       if (mc==m && nc==n) {
         for (int i=1; i<m; i++) {
+          double olda = grid[  i  *n];
+          double oldb = grid[(i-1)*n];
           for (int j=1; j<n; j++) {
-            pgrid[i*n+j] = pgrid[(i-1)*n+j] + pgrid[i*n+(j-1)] - pgrid[(i-1)*n+(j-1)];
+            double const newb = grid[(i-1)*n+j];
+            double const newa = newb - oldb + olda;
+            grid[i*n+j] = newa;
+            olda = newa;
+            oldb = newb;
           }
         }
       } else {

From 1f13c2dad584a21d1532a2dbf5cf2f005a87e711 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 17 May 2018 08:42:14 -0700
Subject: [PATCH 092/245] thrust has nstream, not p2p

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 03fedd850..8bd22e28a 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ f = see footnotes
 | SYCL                 |     |    y    |     y     |    y    |        |       |
 | Boost.Compute        |     |         |           |    y    |        |       |
 | Parallel STL         |  y  |    y    |     y     |    y    |        |       |
-| Thrust               |  y  |         |           |         |        |       |
+| Thrust               |     |         |           |    y    |        |       |
 | TBB                  |  y  |    y    |     y     |    y    |        |       |
 | Kokkos               |  y  |    y    |     y     |    y    |        |       |
 | RAJA                 |  y  |    y    |     y     |    y    |        |       |

From d8473d71f9960233ed08d74570a67e8c56d65b8a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 21 May 2018 10:01:07 -0700
Subject: [PATCH 093/245] Thrust nstream fix (#346)

add device sync
---
 Cxx11/nstream-device-thrust.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu
index 7ea49dabd..13cd1a4e5 100644
--- a/Cxx11/nstream-device-thrust.cu
+++ b/Cxx11/nstream-device-thrust.cu
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_cuda.h"
 
 int main(int argc, char * argv[])
 {
@@ -134,6 +135,7 @@ int main(int argc, char * argv[])
                         thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(A.end()  , B.end()  , C.end())),
                         nstream);
+      prk::CUDA::check( cudaDeviceSynchronize() );
     }
     nstream_time = prk::wtime() - nstream_time;
   }

From 23149c19cc873a1458c77f1cf101bf3ab62d0e1f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 22 May 2018 08:24:12 -0700
Subject: [PATCH 094/245] remove std::transform (#344)

* remove std::transform
* preserve access pattern
---
 Cxx11/stencil-vector-openmp.cc | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/Cxx11/stencil-vector-openmp.cc b/Cxx11/stencil-vector-openmp.cc
index 98343a798..5f5e59f42 100644
--- a/Cxx11/stencil-vector-openmp.cc
+++ b/Cxx11/stencil-vector-openmp.cc
@@ -204,22 +204,17 @@ int main(int argc, char* argv[])
       // Apply the stencil operator
       stencil(n, tile_size, in, out);
       // Add constant to solution to force refresh of neighbor data, if any
-#ifdef _OPENMP
       OMP_FOR( collapse(2) )
       for (auto it=0; it<n; it+=tile_size) {
         for (auto jt=0; jt<n; jt+=tile_size) {
           for (auto i=it; i<std::min(n,it+tile_size); i++) {
-            OMP_SIMD
+            PRAGMA_SIMD
             for (auto j=jt; j<std::min(n,jt+tile_size); j++) {
               in[i*n+j] += 1.0;
             }
           }
         }
       }
-
-#else
-      std::transform(in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
-#endif
     }
     OMP_BARRIER
     OMP_MASTER

From 4729d3adec8302764a650c8676c7cd769a0a80fe Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 23 May 2018 06:18:44 -0700
Subject: [PATCH 095/245] split C++ headers (#349)

* refactor headers
* remove unnecessary preprocessor token
* relocate preprocess tokens
* update make.def examples
* further localize include files where needed
---
 .gitignore                              |   6 +
 Cxx11/Makefile                          |  18 +-
 Cxx11/dgemm-cblas.cc                    |   2 +-
 Cxx11/dgemm-cublas.cu                   |   2 +-
 Cxx11/dgemm-vector.cc                   |   2 +-
 Cxx11/nstream-host-thrust.cc            |   1 +
 Cxx11/nstream-kokkos.cc                 |   1 +
 Cxx11/nstream-occa.cc                   |   2 +
 Cxx11/nstream-sycl.cc                   |   2 +
 Cxx11/nstream-valarray-boost-compute.cc |   2 +
 Cxx11/nstream-vector-boost-compute.cc   |   2 +-
 Cxx11/nstream-vector-pstl.cc            |   1 +
 Cxx11/nstream-vector-raja.cc            |   1 +
 Cxx11/nstream-vector-tbb.cc             |   1 +
 Cxx11/p2p-hyperplane-sycl.cc            |   2 +
 Cxx11/p2p-hyperplane-vector-openmp.cc   |   1 +
 Cxx11/p2p-hyperplane-vector-pstl.cc     |   1 +
 Cxx11/p2p-hyperplane-vector-tbb.cc      |   1 +
 Cxx11/p2p-innerloop-vector-tbb.cc       |   1 +
 Cxx11/p2p-tasks-tbb.cc                  |   4 +-
 Cxx11/p2p-vector-raja.cc                |   1 +
 Cxx11/p2p-vector-tbb.cc                 |   1 +
 Cxx11/prk_kokkos.h                      |  41 +++++
 Cxx11/prk_openmp.h                      |  94 ++++++++++
 Cxx11/prk_pstl.h                        |  52 ++++++
 Cxx11/prk_raja.h                        |  40 +++++
 Cxx11/prk_ranges.h                      |  74 ++++++++
 Cxx11/prk_simd.h                        |  52 ++++++
 Cxx11/prk_tbb.h                         |  55 ++++++
 Cxx11/prk_thrust.h                      |  50 ++++++
 Cxx11/prk_util.h                        | 218 +++---------------------
 Cxx11/stencil-kokkos.cc                 |   1 +
 Cxx11/stencil-sycl.cc                   |   2 +
 Cxx11/stencil-vector-pstl.cc            |   1 +
 Cxx11/stencil-vector-raja.cc            |   1 +
 Cxx11/stencil-vector-tbb.cc             |   1 +
 Cxx11/transpose-host-thrust.cc          |   1 +
 Cxx11/transpose-kokkos.cc               |   1 +
 Cxx11/transpose-occa.cc                 |   2 +
 Cxx11/transpose-sycl.cc                 |   2 +
 Cxx11/transpose-vector-pstl.cc          |   1 +
 Cxx11/transpose-vector-raja.cc          |   1 +
 Cxx11/transpose-vector-tbb.cc           |   1 +
 common/make.defs.cray                   |   4 +
 common/make.defs.gcc                    |  14 +-
 common/make.defs.intel                  |  23 +--
 common/make.defs.llvm                   |  19 ++-
 47 files changed, 569 insertions(+), 237 deletions(-)
 create mode 100644 Cxx11/prk_kokkos.h
 create mode 100644 Cxx11/prk_openmp.h
 create mode 100644 Cxx11/prk_pstl.h
 create mode 100644 Cxx11/prk_raja.h
 create mode 100644 Cxx11/prk_ranges.h
 create mode 100644 Cxx11/prk_simd.h
 create mode 100644 Cxx11/prk_tbb.h
 create mode 100644 Cxx11/prk_thrust.h

diff --git a/.gitignore b/.gitignore
index 6cd0e154b..91cb027fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -112,6 +112,12 @@ C1z/transpose-openmp
 C1z/transpose-target
 C1z/transpose-taskloop
 C1z/transpose-ispc
+Cxx11/boost
+Cxx11/compute
+Cxx11/triSYCL
+Cxx11/occa
+Cxx11/pstl
+Cxx11/range-v3
 Cxx11/dgemm-vector
 Cxx11/dgemm-cblas
 Cxx11/dgemm-cublas
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 07be4fa0b..652c423ba 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -31,22 +31,22 @@ endif
 #ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm
 ASMFLAGS = -fverbose-asm
 
-OMPFLAGS = $(OPENMPFLAG)
+OMPFLAGS = $(OPENMPFLAG) -DUSE_OPENMP
 TARGETFLAGS = $(OFFLOADFLAG)
 OPENCLFLAGS = $(OPENCLFLAG)
 # We do not yet handle all possible exceptions...
 #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS
-SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0
+SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0
 ORNLACCFLAGS = $(ORNLACCFLAG)
-TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
+TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
-BOOSTFLAGS = $(BOOSTFLAG)
-RANGEFLAGS = -DUSE_RANGES $(RANGEFLAG)
+BOOSTFLAGS = $(BOOSTFLAG) -DUSE_BOOST
+RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES
 STLFLAGS = $(STLFLAG) $(RANGEFLAGS)
-PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS)
-RAJAFLAGS = $(RAJAFLAG)
+PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL
+RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA
 THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST
-KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS)
+KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS
 ORNLACCFLAGS = $(ORNLACCFLAG)
 
 ifdef OCCADIR
@@ -170,7 +170,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(RANGEFLAGS) -o $@
 
 %-boost-compute: %-boost-compute.cc prk_util.h
-	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -DUSE_BOOST_COMPUTE $(OPENCLFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) $(OPENCLFLAGS) -o $@
 
 %-raja: %-raja.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@
diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index cb0e44f51..8390b7c11 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -302,7 +302,7 @@ int main(int argc, char * argv[])
   const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
   double residuum(0);
   for (int b=0; b<matrices; ++b) {
-      const auto checksum = prk_reduce(C[b].begin(), C[b].end(), 0.0);
+      const auto checksum = prk::reduce(C[b].begin(), C[b].end(), 0.0);
       residuum += std::abs(checksum-reference)/reference;
   }
   residuum/=matrices;
diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu
index bced87c6c..d780980e1 100644
--- a/Cxx11/dgemm-cublas.cu
+++ b/Cxx11/dgemm-cublas.cu
@@ -309,7 +309,7 @@ int main(int argc, char * argv[])
   const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
   double residuum(0);
   for (int b=0; b<matrices; ++b) {
-      const auto checksum = prk_reduce( &(h_c[b*order*order+0]), &(h_c[b*order*order+nelems]), 0.0);
+      const auto checksum = prk::reduce( &(h_c[b*order*order+0]), &(h_c[b*order*order+nelems]), 0.0);
       residuum += std::abs(checksum-reference)/reference;
   }
   residuum/=matrices;
diff --git a/Cxx11/dgemm-vector.cc b/Cxx11/dgemm-vector.cc
index 973c0df97..2734d7933 100644
--- a/Cxx11/dgemm-vector.cc
+++ b/Cxx11/dgemm-vector.cc
@@ -186,7 +186,7 @@ int main(int argc, char * argv[])
 
   const auto forder = static_cast<double>(order);
   const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
-  const auto checksum = prk_reduce(C.begin(), C.end(), 0.0);
+  const auto checksum = prk::reduce(C.begin(), C.end(), 0.0);
 
   const auto epsilon = 1.0e-8;
   const auto residuum = std::abs(checksum-reference)/reference;
diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc
index 5bc29d145..c06c89108 100644
--- a/Cxx11/nstream-host-thrust.cc
+++ b/Cxx11/nstream-host-thrust.cc
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_thrust.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc
index 7e468abcf..d03a47207 100644
--- a/Cxx11/nstream-kokkos.cc
+++ b/Cxx11/nstream-kokkos.cc
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_kokkos.h"
 
 // We build with OpenMP unless it is not available...
 #ifndef PRK_KOKKOS_BACKEND
diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc
index 10b0b47fd..6d584e893 100644
--- a/Cxx11/nstream-occa.cc
+++ b/Cxx11/nstream-occa.cc
@@ -61,6 +61,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "occa.hpp"
+
 #include "prk_util.h"
 
 int main(int argc, char * argv[])
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index c5d390341..2193d4811 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -62,6 +62,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "CL/sycl.hpp"
+
 #include "prk_util.h"
 
 int main(int argc, char * argv[])
diff --git a/Cxx11/nstream-valarray-boost-compute.cc b/Cxx11/nstream-valarray-boost-compute.cc
index 50c54846f..da3ded46f 100644
--- a/Cxx11/nstream-valarray-boost-compute.cc
+++ b/Cxx11/nstream-valarray-boost-compute.cc
@@ -62,6 +62,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "boost/compute.hpp"
+
 #include "prk_util.h"
 
 namespace compute = boost::compute;
diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc
index fec24fbbf..619c02374 100644
--- a/Cxx11/nstream-vector-boost-compute.cc
+++ b/Cxx11/nstream-vector-boost-compute.cc
@@ -62,7 +62,7 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#define LAMBDA_MAKE_TUPLE 1
+#include "boost/compute.hpp"
 
 #include "prk_util.h"
 
diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index 852db5de5..9eb18c78f 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_pstl.h"
 
 // See ParallelSTL.md for important information.
 
diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc
index 31c6434e7..62f92832f 100644
--- a/Cxx11/nstream-vector-raja.cc
+++ b/Cxx11/nstream-vector-raja.cc
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_raja.h"
 
 #if defined(RAJA_ENABLE_OPENMP)
   typedef RAJA::omp_parallel_for_exec thread_exec;
diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc
index cb73d3eda..0fbc777c2 100644
--- a/Cxx11/nstream-vector-tbb.cc
+++ b/Cxx11/nstream-vector-tbb.cc
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc
index 68e7fc712..a738beffa 100644
--- a/Cxx11/p2p-hyperplane-sycl.cc
+++ b/Cxx11/p2p-hyperplane-sycl.cc
@@ -59,6 +59,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "CL/sycl.hpp"
+
 #include "prk_util.h"
 #include "p2p-kernel.h"
 
diff --git a/Cxx11/p2p-hyperplane-vector-openmp.cc b/Cxx11/p2p-hyperplane-vector-openmp.cc
index 6bd35bcfe..471ce336c 100644
--- a/Cxx11/p2p-hyperplane-vector-openmp.cc
+++ b/Cxx11/p2p-hyperplane-vector-openmp.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_openmp.h"
 #include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
index 3e9030f33..e17412ac2 100644
--- a/Cxx11/p2p-hyperplane-vector-pstl.cc
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_pstl.h"
 #include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc
index 863580a0c..9c523a369 100644
--- a/Cxx11/p2p-hyperplane-vector-tbb.cc
+++ b/Cxx11/p2p-hyperplane-vector-tbb.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 #include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc
index 2bff51f15..1f58ab081 100644
--- a/Cxx11/p2p-innerloop-vector-tbb.cc
+++ b/Cxx11/p2p-innerloop-vector-tbb.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc
index c19a55d44..266c87ad6 100644
--- a/Cxx11/p2p-tasks-tbb.cc
+++ b/Cxx11/p2p-tasks-tbb.cc
@@ -61,11 +61,9 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 #include "p2p-kernel.h"
 
-#include "tbb/flow_graph.h"
-#include "tbb/parallel_for.h"
-
 int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
diff --git a/Cxx11/p2p-vector-raja.cc b/Cxx11/p2p-vector-raja.cc
index 7dfeea21d..00164aa94 100644
--- a/Cxx11/p2p-vector-raja.cc
+++ b/Cxx11/p2p-vector-raja.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_raja.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/Cxx11/p2p-vector-tbb.cc b/Cxx11/p2p-vector-tbb.cc
index bcc45b27b..74cf57819 100644
--- a/Cxx11/p2p-vector-tbb.cc
+++ b/Cxx11/p2p-vector-tbb.cc
@@ -60,6 +60,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 
 void SequentialSweep(int m, int n, std::vector<double> & grid)
 {
diff --git a/Cxx11/prk_kokkos.h b/Cxx11/prk_kokkos.h
new file mode 100644
index 000000000..760ae91d7
--- /dev/null
+++ b/Cxx11/prk_kokkos.h
@@ -0,0 +1,41 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_KOKKOS_H
+#define PRK_KOKKOS_H
+
+#ifdef USE_KOKKOS
+# include <Kokkos_Core.hpp>
+# include <Kokkos_Concepts.hpp>
+# include <Kokkos_MemoryTraits.hpp>
+#endif
+
+#endif /* PRK_KOKKOS_H */
diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h
new file mode 100644
index 000000000..4d6396b9b
--- /dev/null
+++ b/Cxx11/prk_openmp.h
@@ -0,0 +1,94 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_OPENMP_H
+#define PRK_OPENMP_H
+
+#define PRAGMA(x) _Pragma(#x)
+
+#ifdef _OPENMP
+# include <omp.h>
+# define OMP(x) PRAGMA(omp x)
+# define OMP_PARALLEL(x) PRAGMA(omp parallel x)
+# define OMP_PARALLEL_FOR_REDUCE(x) PRAGMA(omp parallel for reduction (x) )
+# define OMP_MASTER PRAGMA(omp master)
+# define OMP_BARRIER PRAGMA(omp barrier)
+# define OMP_FOR(x) PRAGMA(omp for x)
+# define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) )
+// OpenMP SIMD if supported, else not.
+# if (_OPENMP >= 201300)
+#  define OMP_SIMD PRAGMA(omp simd)
+#  define OMP_FOR_SIMD PRAGMA(omp for simd)
+#  define OMP_TASK(x) PRAGMA(omp task x)
+#  define OMP_TASKLOOP(x) PRAGMA(omp taskloop x )
+#  if defined(__INTEL_COMPILER)
+#   define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop x )
+#  else
+#   define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop collapse(n) x )
+#  endif
+#  define OMP_TASKWAIT PRAGMA(omp taskwait)
+#  define OMP_ORDERED(x) PRAGMA(omp ordered x)
+#  define OMP_TARGET(x) PRAGMA(omp target x)
+#  define OMP_DECLARE_TARGET PRAGMA(omp declare target)
+#  define OMP_END_DECLARE_TARGET PRAGMA(omp end declare target)
+# else
+#  define OMP_SIMD
+#  define OMP_FOR_SIMD PRAGMA(omp for)
+#  define OMP_TASK(x)
+#  define OMP_TASKLOOP(x)
+#  define OMP_TASKLOOP_COLLAPSE(n,x)
+#  define OMP_TASKWAIT
+#  define OMP_ORDERED(x)
+#  define OMP_TARGET(x)
+#  define OMP_DECLARE_TARGET
+#  define OMP_END_DECLARE_TARGET
+# endif
+#else
+# define OMP(x)
+# define OMP_PARALLEL(x)
+# define OMP_PARALLEL_FOR_REDUCE(x)
+# define OMP_MASTER
+# define OMP_BARRIER
+# define OMP_FOR(x)
+# define OMP_FOR_REDUCE(x)
+# define OMP_SIMD
+# define OMP_FOR_SIMD
+# define OMP_TASK(x)
+# define OMP_TASKLOOP(x)
+# define OMP_TASKLOOP_COLLAPSE(n,x)
+# define OMP_TASKWAIT
+# define OMP_ORDERED(x)
+# define OMP_TARGET(x)
+# define OMP_DECLARE_TARGET
+# define OMP_END_DECLARE_TARGET
+#endif
+
+#endif /* PRK_OPENMP_H */
diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h
new file mode 100644
index 000000000..5c89d765f
--- /dev/null
+++ b/Cxx11/prk_pstl.h
@@ -0,0 +1,52 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_PSTL_H
+#define PRK_PSTL_H
+
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1800)
+#define USE_INTEL_PSTL
+#endif
+
+#ifdef USE_PSTL
+# ifdef USE_INTEL_PSTL
+#  include <pstl/execution>
+#  include <pstl/algorithm>
+#  include <pstl/numeric>
+#  include <pstl/memory>
+# elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+       ( (__GNUC__ >= 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
+#  include <parallel/algorithm>
+#  include <parallel/numeric>
+# endif
+#endif
+
+#endif /* PRK_PSTL_H */
diff --git a/Cxx11/prk_raja.h b/Cxx11/prk_raja.h
new file mode 100644
index 000000000..fb0bb25b8
--- /dev/null
+++ b/Cxx11/prk_raja.h
@@ -0,0 +1,40 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_RAJA_H
+#define PRK_RAJA_H
+
+#ifdef USE_RAJA
+# define RAJA_ENABLE_NESTED 1
+# include "RAJA/RAJA.hpp"
+#endif
+
+#endif /* PRK_RAJA_H */
diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h
new file mode 100644
index 000000000..d794016ff
--- /dev/null
+++ b/Cxx11/prk_ranges.h
@@ -0,0 +1,74 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_RANGES_H
+#define PRK_RANGES_H
+
+#if defined(USE_RANGES)
+# if defined(USE_RANGES_IRANGE)
+#  include "boost/range/irange.hpp"
+# elif defined(USE_RANGES_TS)
+#  include "range/v3/view/iota.hpp"
+#  include "range/v3/view/slice.hpp"
+#  include "range/v3/view/stride.hpp"
+# else
+#  error You have not provided a version of ranges to use.
+# endif
+#endif
+
+namespace prk {
+
+    template <class S, class E>
+    auto range(S start, E end) {
+#if defined(USE_BOOST_IRANGE)
+        return boost::irange(static_cast<decltype(end)>(start), end);
+#elif defined(USE_RANGES_TS)
+        return ranges::view::iota(static_cast<decltype(end)>(start), end);
+#endif
+    }
+
+    template <class S, class E, class B>
+    auto range(S start, E end, B blocking) {
+#if defined(USE_BOOST_IRANGE)
+        return boost::irange(static_cast<decltype(end)>(start), end, static_cast<decltype(end)>(blocking) );
+#elif defined(USE_RANGES_TS)
+        // NOTE:
+        // iota(s) | slice(s,e) | stride(b)  is faster than
+        // iota(s,e) | stride(b) for some reason.
+        return ranges::view::iota(static_cast<decltype(end)>(start)) |
+               ranges::view::slice(static_cast<decltype(end)>(start), end) |
+               ranges::view::stride(static_cast<decltype(end)>(blocking));
+#endif
+    }
+
+} // namespace prk
+
+#endif /* PRK_RANGES_H */
diff --git a/Cxx11/prk_simd.h b/Cxx11/prk_simd.h
new file mode 100644
index 000000000..742bc4fcb
--- /dev/null
+++ b/Cxx11/prk_simd.h
@@ -0,0 +1,52 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_SIMD_H
+#define PRK_SIMD_H
+
+#define PRAGMA(x) _Pragma(#x)
+
+#if defined(__INTEL_COMPILER)
+# define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep)
+// According to https://github.com/LLNL/RAJA/pull/310, this improves lambda performance
+# define PRAGMA_INLINE PRAGMA(forceinline recursive)
+#elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) )
+# define PRAGMA_SIMD PRAGMA(GCC ivdep)
+# define PRAGMA_INLINE PRAGMA(inline)
+#elif defined(__clang__)
+# define PRAGMA_SIMD PRAGMA(clang loop vectorize(assume_safety))
+# define PRAGMA_INLINE
+#else
+# define PRAGMA_SIMD
+# define PRAGMA_INLINE
+#endif
+
+#endif /* PRK_SIMD_H */
diff --git a/Cxx11/prk_tbb.h b/Cxx11/prk_tbb.h
new file mode 100644
index 000000000..86abfd5c9
--- /dev/null
+++ b/Cxx11/prk_tbb.h
@@ -0,0 +1,55 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_TBB_H
+#define PRK_TBB_H
+
+#ifdef USE_TBB
+# include <tbb/tbb.h>
+# include <tbb/parallel_for.h>
+# include <tbb/blocked_range.h>
+# include <tbb/flow_graph.h>
+# if ( PRK_TBB_PARTITIONER == 1)
+//#  warning STATIC
+   tbb::static_partitioner tbb_partitioner;
+# elif ( PRK_TBB_PARTITIONER == 2)
+//#  warning AFFINITY
+   tbb::affinity_partitioner tbb_partitioner;
+# elif ( PRK_TBB_PARTITIONER == 3)
+//#  warning SIMPLE
+   tbb::simple_partitioner tbb_partitioner;
+# else
+//#  warning AUTO
+   tbb::auto_partitioner tbb_partitioner;
+# endif
+#endif
+
+#endif /* PRK_TBB_H */
diff --git a/Cxx11/prk_thrust.h b/Cxx11/prk_thrust.h
new file mode 100644
index 000000000..4ffd50c34
--- /dev/null
+++ b/Cxx11/prk_thrust.h
@@ -0,0 +1,50 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PRK_THRUST_H
+#define PRK_THRUST_H
+
+#ifdef USE_THRUST
+# ifdef __NVCC__
+#  include <thrust/device_vector.h>
+# endif
+# include <thrust/host_vector.h>
+# include <thrust/fill.h>
+# include <thrust/sequence.h>
+# include <thrust/for_each.h>
+# include <thrust/transform.h>
+# include <thrust/transform_reduce.h>
+# include <thrust/iterator/counting_iterator.h>
+# include <thrust/execution_policy.h>
+# include <thrust/functional.h>
+#endif
+
+#endif /* PRK_THRUST_H */
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 76363d8e2..2c0be683f 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -64,22 +64,6 @@
 #include <numeric>
 #include <algorithm>
 
-template<class I, class T>
-const T prk_reduce(I first, I last, T init) {
-#if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__)
-    return std::reduce(first, last, init);
-#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
-    return std::accumulate(first, last, init);
-#else
-    // unreachable, but preserved as reference implementation
-    T r(0);
-    for (I i=first; i!=last; ++i) {
-        r += *i;
-    }
-    return r;
-#endif
-}
-
 // These headers are busted with NVCC and GCC 5.4.0
 // The <future> header is busted with Cray C++ 8.6.1.
 #if !defined(__NVCC__) && !defined(_CRAYC)
@@ -87,174 +71,39 @@ const T prk_reduce(I first, I last, T init) {
 #include <future>
 #endif
 
-#define PRAGMA(x) _Pragma(#x)
+#include "prk_simd.h"
 
-#ifdef _OPENMP
-# include <omp.h>
-# define OMP(x) PRAGMA(omp x)
-# define OMP_PARALLEL(x) PRAGMA(omp parallel x)
-# define OMP_PARALLEL_FOR_REDUCE(x) PRAGMA(omp parallel for reduction (x) )
-# define OMP_MASTER PRAGMA(omp master)
-# define OMP_BARRIER PRAGMA(omp barrier)
-# define OMP_FOR(x) PRAGMA(omp for x)
-# define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) )
-// OpenMP SIMD if supported, else not.
-# if (_OPENMP >= 201300)
-#  define OMP_SIMD PRAGMA(omp simd)
-#  define OMP_FOR_SIMD PRAGMA(omp for simd)
-#  define OMP_TASK(x) PRAGMA(omp task x)
-#  define OMP_TASKLOOP(x) PRAGMA(omp taskloop x )
-#  if defined(__INTEL_COMPILER)
-#   define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop x )
-#  else
-#   define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop collapse(n) x )
-#  endif
-#  define OMP_TASKWAIT PRAGMA(omp taskwait)
-#  define OMP_ORDERED(x) PRAGMA(omp ordered x)
-#  define OMP_TARGET(x) PRAGMA(omp target x)
-#  define OMP_DECLARE_TARGET PRAGMA(omp declare target)
-#  define OMP_END_DECLARE_TARGET PRAGMA(omp end declare target)
-# else
-#  define OMP_SIMD
-#  define OMP_FOR_SIMD PRAGMA(omp for)
-#  define OMP_TASK(x)
-#  define OMP_TASKLOOP(x)
-#  define OMP_TASKLOOP_COLLAPSE(n,x)
-#  define OMP_TASKWAIT
-#  define OMP_ORDERED(x)
-#  define OMP_TARGET(x)
-#  define OMP_DECLARE_TARGET
-#  define OMP_END_DECLARE_TARGET
-# endif
-#else
-# define OMP(x)
-# define OMP_PARALLEL(x)
-# define OMP_PARALLEL_FOR_REDUCE(x)
-# define OMP_MASTER
-# define OMP_BARRIER
-# define OMP_FOR(x)
-# define OMP_FOR_REDUCE(x)
-# define OMP_SIMD
-# define OMP_FOR_SIMD
-# define OMP_TASK(x)
-# define OMP_TASKLOOP(x)
-# define OMP_TASKLOOP_COLLAPSE(n,x)
-# define OMP_TASKWAIT
-# define OMP_ORDERED(x)
-# define OMP_TARGET(x)
-# define OMP_DECLARE_TARGET
-# define OMP_END_DECLARE_TARGET
+#ifdef USE_RANGES
+# include "prk_ranges.h"
 #endif
 
-#if defined(__INTEL_COMPILER)
-# define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep)
-// According to https://github.com/LLNL/RAJA/pull/310, this improves lambda performance
-# define PRAGMA_INLINE PRAGMA(forceinline recursive)
-#elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) )
-# define PRAGMA_SIMD PRAGMA(GCC ivdep)
-# define PRAGMA_INLINE PRAGMA(inline)
-#elif defined(__clang__)
-# define PRAGMA_SIMD PRAGMA(clang loop vectorize(assume_safety))
-# define PRAGMA_INLINE
-#else
-# define PRAGMA_SIMD
-# define PRAGMA_INLINE
-#endif
-
-#ifdef USE_TBB
-# include <tbb/tbb.h>
-# include <tbb/parallel_for.h>
-# include <tbb/blocked_range.h>
-# if ( PRK_TBB_PARTITIONER == 1)
-//#  warning STATIC
-   tbb::static_partitioner tbb_partitioner;
-# elif ( PRK_TBB_PARTITIONER == 2)
-//#  warning AFFINITY
-   tbb::affinity_partitioner tbb_partitioner;
-# elif ( PRK_TBB_PARTITIONER == 3)
-//#  warning SIMPLE
-   tbb::simple_partitioner tbb_partitioner;
-# else
-//#  warning AUTO
-   tbb::auto_partitioner tbb_partitioner;
-# endif
-#endif
-
-#if defined(USE_RANGES)
-# if defined(USE_BOOST_IRANGE)
-#  include "boost/range/irange.hpp"
-# elif defined(USE_RANGES_TS)
-#  include "range/v3/view/iota.hpp"
-#  include "range/v3/view/slice.hpp"
-#  include "range/v3/view/stride.hpp"
-# else
-#  error You have not provided a version of ranges to use.
-# endif
-#endif
-
-#if defined(USE_BOOST_COMPUTE)
-# include "boost/compute.hpp"
-# include "boost/compute/container/valarray.hpp"
-#endif
-
-#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1800)
-#define USE_INTEL_PSTL
-#endif
-
-#ifdef USE_PSTL
-# ifdef USE_INTEL_PSTL
-#  include <pstl/execution>
-#  include <pstl/algorithm>
-#  include <pstl/numeric>
-#  include <pstl/memory>
-# elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \
-       ( (__GNUC__ >= 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
-#  include <parallel/algorithm>
-#  include <parallel/numeric>
-# endif
-#endif
-
-#ifdef USE_KOKKOS
-# include <Kokkos_Core.hpp>
-# include <Kokkos_Concepts.hpp>
-# include <Kokkos_MemoryTraits.hpp>
-#endif
-
-#ifdef USE_RAJA
-# define RAJA_ENABLE_NESTED 1
-# include "RAJA/RAJA.hpp"
-#endif
-
-#ifdef USE_THRUST
-# ifdef __NVCC__
-#  include <thrust/device_vector.h>
-# endif
-# include <thrust/host_vector.h>
-# include <thrust/fill.h>
-# include <thrust/sequence.h>
-# include <thrust/for_each.h>
-# include <thrust/transform.h>
-# include <thrust/transform_reduce.h>
-# include <thrust/iterator/counting_iterator.h>
-# include <thrust/execution_policy.h>
-# include <thrust/functional.h>
-#endif
-
-#ifdef USE_SYCL
-# include "CL/sycl.hpp"
-#endif
-
-#ifdef USE_OCCA
-# include "occa.hpp"
+#ifdef USE_OPENMP
+# include "prk_openmp.h"
 #endif
 
 #define RESTRICT __restrict__
 
 namespace prk {
 
+    template<class I, class T>
+    const T reduce(I first, I last, T init) {
+#if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__)
+        return std::reduce(first, last, init);
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
+        return std::accumulate(first, last, init);
+#else
+        // unreachable, but preserved as reference implementation
+        T r(0);
+        for (I i=first; i!=last; ++i) {
+            r += *i;
+        }
+        return r;
+#endif
+    }
+
     static inline double wtime(void)
     {
-#ifdef _OPENMP
+#if defined(USE_OPENMP) && defined(_OPENMP)
         return omp_get_wtime();
 #else
         using t = std::chrono::high_resolution_clock;
@@ -271,29 +120,6 @@ namespace prk {
         return ( numerator / denominator + (numerator % denominator > 0) );
     }
 
-    template <class S, class E>
-    auto range(S start, E end) {
-#if defined(USE_BOOST_IRANGE)
-        return boost::irange(static_cast<decltype(end)>(start), end);
-#elif defined(USE_RANGES_TS)
-        return ranges::view::iota(static_cast<decltype(end)>(start), end);
-#endif
-    }
-
-    template <class S, class E, class B>
-    auto range(S start, E end, B blocking) {
-#if defined(USE_BOOST_IRANGE)
-        return boost::irange(static_cast<decltype(end)>(start), end, static_cast<decltype(end)>(blocking) );
-#elif defined(USE_RANGES_TS)
-        // NOTE:
-        // iota(s) | slice(s,e) | stride(b)  is faster than
-        // iota(s,e) | stride(b) for some reason.
-        return ranges::view::iota(static_cast<decltype(end)>(start)) |
-               ranges::view::slice(static_cast<decltype(end)>(start), end) |
-               ranges::view::stride(static_cast<decltype(end)>(blocking));
-#endif
-    }
-
 } // namespace prk
 
 #endif /* PRK_UTIL_H */
diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc
index d2eb5db2a..b92bd4a57 100644
--- a/Cxx11/stencil-kokkos.cc
+++ b/Cxx11/stencil-kokkos.cc
@@ -61,6 +61,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_kokkos.h"
 
 typedef Kokkos::View<double**, Kokkos::LayoutRight> matrix;
 //typedef Kokkos::View<double**, Kokkos::LayoutLeft> matrix;
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 9989b5bdc..e42eaef50 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -60,6 +60,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "CL/sycl.hpp"
+
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc
index 8495032ca..6c14800af 100644
--- a/Cxx11/stencil-vector-pstl.cc
+++ b/Cxx11/stencil-vector-pstl.cc
@@ -61,6 +61,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_pstl.h"
 // See ParallelSTL.md for important information.
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
 #include "stencil_pstl.hpp"
diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc
index 3bcecb4ec..cff3421f3 100644
--- a/Cxx11/stencil-vector-raja.cc
+++ b/Cxx11/stencil-vector-raja.cc
@@ -61,6 +61,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_raja.h"
 
 // This must be before the stencil header, which uses this.
 #ifdef RAJA_ENABLE_OPENMP
diff --git a/Cxx11/stencil-vector-tbb.cc b/Cxx11/stencil-vector-tbb.cc
index 2f5c27488..81a252019 100644
--- a/Cxx11/stencil-vector-tbb.cc
+++ b/Cxx11/stencil-vector-tbb.cc
@@ -61,6 +61,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 #include "stencil_tbb.hpp"
 
 void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out)
diff --git a/Cxx11/transpose-host-thrust.cc b/Cxx11/transpose-host-thrust.cc
index 53066208b..11482700a 100644
--- a/Cxx11/transpose-host-thrust.cc
+++ b/Cxx11/transpose-host-thrust.cc
@@ -50,6 +50,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_thrust.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc
index aff072f53..fa35ebb6e 100644
--- a/Cxx11/transpose-kokkos.cc
+++ b/Cxx11/transpose-kokkos.cc
@@ -50,6 +50,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_kokkos.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/Cxx11/transpose-occa.cc b/Cxx11/transpose-occa.cc
index 5b05b73ce..888d6a230 100644
--- a/Cxx11/transpose-occa.cc
+++ b/Cxx11/transpose-occa.cc
@@ -51,6 +51,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "occa.hpp"
+
 #include "prk_util.h"
 
 int main(int argc, char * argv[])
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 5055374d2..1c8489806 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -49,6 +49,8 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
+#include "CL/sycl.hpp"
+
 #include "prk_util.h"
 
 int main(int argc, char * argv[])
diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc
index 222322bd8..f51f76e7f 100644
--- a/Cxx11/transpose-vector-pstl.cc
+++ b/Cxx11/transpose-vector-pstl.cc
@@ -50,6 +50,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_pstl.h"
 
 // See ParallelSTL.md for important information.
 
diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc
index 84738694d..59b757eea 100644
--- a/Cxx11/transpose-vector-raja.cc
+++ b/Cxx11/transpose-vector-raja.cc
@@ -50,6 +50,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_raja.h"
 
 const int tile_size = 32;
 
diff --git a/Cxx11/transpose-vector-tbb.cc b/Cxx11/transpose-vector-tbb.cc
index 45ea4bc5b..d154677fd 100644
--- a/Cxx11/transpose-vector-tbb.cc
+++ b/Cxx11/transpose-vector-tbb.cc
@@ -53,6 +53,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_tbb.h"
 
 int main(int argc, char * argv[])
 {
diff --git a/common/make.defs.cray b/common/make.defs.cray
index aee737d77..b1c59b8f4 100644
--- a/common/make.defs.cray
+++ b/common/make.defs.cray
@@ -27,6 +27,10 @@ ORNLACCFLAG=-h acc
 # NERSC: "module load boost"
 BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I$${BOOST_DIR}/include
 #
+# CBLAS for C++ DGEMM
+#
+CBLASFLAG= # LibSci likely included by default
+#
 # MPI
 #
 # cc wraps gcc, icc or craycc, depending on your PrgEng module.
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 586cec08c..e4eda09f5 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -31,6 +31,7 @@ DEFAULT_OPT_FLAGS+=-Wall
 # OpenMP flags
 #
 OPENMPFLAG=-fopenmp
+OPENMPSIMDFLAG=-fopenmp-simd
 OFFLOADFLAG=-foffload="-O3 -v"
 ORNLACCFLAG=-fopenacc
 #
@@ -48,7 +49,7 @@ OPENCLFLAG=-framework OpenCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
 SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include
+SYCLFLAG=-I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -70,17 +71,16 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
-#PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG}
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
-KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
-RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust
-THRUSTFLAG=-DUSE_THRUST -I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
+THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 #
 # CBLAS for C++ DGEMM
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 556d940b6..00a781cac 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -29,6 +29,7 @@ DEFAULT_OPT_FLAGS+=-qopt-report=5
 # OpenMP flags
 #
 OPENMPFLAG=-qopenmp
+OPENMPSIMDFLAG=-qopenmp-simd
 OFFLOADFLAG=-qopenmp-offload=host
 #
 # OpenCL flags
@@ -45,7 +46,7 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
 SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -std=gnu++14 -I$(SYCLDIR)/include
+SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -54,7 +55,7 @@ SYCLFLAG=-DUSE_SYCL -std=gnu++14 -I$(SYCLDIR)/include
 #
 # OCCA
 #
-OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
 #
 # Cilk
 #
@@ -62,19 +63,21 @@ CILKFLAG=-intel-extensions # default
 #
 # TBB
 #
-TBBFLAG=-DUSE_TBB -tbb
-#TBBFLAG=-DUSE_TBB -tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE
+TBBFLAG=-tbb
+#TBBFLAG=-tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE
 #
 # Parallel STL, Boost, etc.
 #
-#BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
-RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/intel
-KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/intel
-RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+#THRUSTDIR=/opt/nvidia/thrust
+#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 #
 # CBLAS for C++ DGEMM
 #
@@ -85,7 +88,7 @@ CBLASFLAG=-DMKL -mkl
 # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
 #NVCC=/opt/llvm/cocl/bin/cocl
 # Linux w/ NVIDIA CUDA
-NVCC=nvcc
+NVCC=nvcc -arch=sm_50
 CUDAFLAGS=-g -O3 -std=c++11
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 481a624ea..f4a54c4f8 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -34,8 +34,9 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
 # OpenMP flags
 #
 OPENMPFLAG=-fopenmp
+OPENMPSIMDFLAG=-fopenmp-simd
 OFFLOADFLAG=-fopenmp
-ORNLACCFLAG= # Flang does not support OpenACC
+#ORNLACCFLAG= # Flang does not support OpenACC
 # Klondike weirdness
 # OPENMPFLAG+=-L/opt/intel/compilers_and_libraries_2018.0.082/linux/compiler/lib/intel64_lin -liomp5
 # Mac weirdness
@@ -75,8 +76,8 @@ SYCLFLAG+=-std=c++14
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
-SYCLCXX=${CXX} -std=gnu++14 ${OPENMPFLAG}
-SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS)
+SYCLCXX=${CXX} ${OPENMPFLAG}
+SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS)
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -94,14 +95,16 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/clang
-KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/clang
-RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+#THRUSTDIR=/opt/nvidia/thrust
+#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 #
 # CBLAS for C++ DGEMM
 #
@@ -112,7 +115,7 @@ CBLASFLAG=-DACCELERATE -framework Accelerate
 # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
 NVCC=/opt/llvm/cocl/bin/cocl
 # Linux w/ NVIDIA CUDA
-#NVCC=nvcc
+#NVCC=nvcc -arch=sm_50
 CUDAFLAGS=-g -O3 -std=c++11
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED

From ec0919f6503bd72d7b79731636a655e7600bf5d4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 30 May 2018 22:11:14 -0700
Subject: [PATCH 096/245] add CBLAS transpose (#350)

* add CBLAS transpose

- MKL and Accelerate are supported via extensions
- add Travis support
- add 'cblas' target to Makefile

* add CBLAS transpose to docs

* remove Rob from default owners so errant code review requests stop happening

* remove cblas_int

* fix unrelated issue with homebrew ompi
---
 CODEOWNERS               |   4 +-
 Cxx11/Makefile           |   2 +
 Cxx11/dgemm-cblas.cc     |  24 +++---
 Cxx11/transpose-cblas.cc | 178 +++++++++++++++++++++++++++++++++++++++
 README.md                |   2 +-
 travis/build-run-prk.sh  |   3 +-
 travis/install-mpi.sh    |   2 +
 7 files changed, 199 insertions(+), 16 deletions(-)
 create mode 100644 Cxx11/transpose-cblas.cc

diff --git a/CODEOWNERS b/CODEOWNERS
index dafe2ca29..6f426a040 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,7 +2,7 @@
 # Each line is a file pattern followed by one or more owners.
 
 # These owners will be the default owners for everything in the repo.
-*       @jeffhammond @rfvander
+*       @jeffhammond
 
 # Order is important. The last matching pattern has the most precedence.
 # So if a pull request only touches javascript files, only these owners
@@ -15,8 +15,8 @@ Cxx11/*                     @jeffhammond
 FENIX/*                     @rfvander @marcgamell
 FG_MPI/*                    @rfvander
 FORTRAN/*                   @jeffhammond
-GRAPPA/*                    @nelsonje
 FORTRAN/*coarray.f90        @afanfa @zbeekman @jeffhammond
+GRAPPA/*                    @nelsonje
 JULIA/*                     @kpamnany @jeffhammond
 LEGION/*                    @magnatelee @elliottslaughter @apokayi @rfvander
 MPI1/*                      @rfvander
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 652c423ba..255c706e2 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -124,6 +124,8 @@ cuda: transpose-cuda
 
 cublas: transpose-cublas nstream-cublas dgemm-cublas
 
+cblas: transpose-cblas dgemm-cblas
+
 occa: transpose-occa nstream-occa
 
 ornlacc: p2p-hyperplane-vector-ornlacc
diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index 8390b7c11..61a9292fb 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -63,14 +63,14 @@
 
 #if defined(MKL)
 #include <mkl.h>
-typedef MKL_INT cblas_int;
+#ifdef MKL_ILP64
+#error Use the MKL library for 32-bit integers!
+#endif
 #elif defined(ACCELERATE)
 // The location of cblas.h is not in the system include path when -framework Accelerate is provided.
 #include <Accelerate/Accelerate.h>
-typedef int cblas_int;
 #else
 #include <cblas.h>
-typedef int cblas_int;
 #endif
 
 #ifdef PRK_DEBUG
@@ -95,7 +95,7 @@ void prk_dgemm(const int order,
                const std::vector<double> & B,
                      std::vector<double> & C)
 {
-    const cblas_int n = order;
+    const int n = order;
     const double alpha = 1.0;
     const double beta  = 1.0;
 
@@ -108,7 +108,7 @@ void prk_dgemm(const int order, const int batches,
                const std::vector<std::vector<double>> & B,
                      std::vector<std::vector<double>> & C)
 {
-    const cblas_int n = order;
+    const int n = order;
     const double alpha = 1.0;
     const double beta  = 1.0;
 
@@ -123,7 +123,7 @@ void prk_dgemm(const int order, const int batches, const int nt,
                const std::vector<std::vector<double>> & B,
                      std::vector<std::vector<double>> & C)
 {
-    const cblas_int n = order;
+    const int n = order;
     const double alpha = 1.0;
     const double beta  = 1.0;
 
@@ -141,17 +141,17 @@ void prk_dgemm(const int order, const int batches,
                double** & B,
                double** & C)
 {
-    const cblas_int n = order;
+    const int n = order;
     const double alpha = 1.0;
     const double beta  = 1.0;
 
-    const cblas_int group_count = 1;
-    const cblas_int group_size[group_count] = { batches };
+    const int group_count = 1;
+    const int group_size[group_count] = { batches };
 
     const CBLAS_TRANSPOSE transa_array[group_count] = { CblasNoTrans };
     const CBLAS_TRANSPOSE transb_array[group_count] = { CblasNoTrans };
 
-    const cblas_int n_array[group_count] = { n };
+    const int n_array[group_count] = { n };
 
     const double alpha_array[group_count] = { alpha };
     const double beta_array[group_count]  = { beta };
@@ -182,7 +182,7 @@ void prk_dgemm(const int order, const int batches,
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11 CBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
+  std::cout << "C++11/CBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   /// Read and test input parameters
@@ -193,7 +193,7 @@ int main(int argc, char * argv[])
   int batches = 0;
   int batch_threads = 1;
   try {
-      if (argc < 2) {
+      if (argc < 3) {
         throw "Usage: <# iterations> <matrix order> [<batches> <batch threads>]";
       }
 
diff --git a/Cxx11/transpose-cblas.cc b/Cxx11/transpose-cblas.cc
new file mode 100644
index 000000000..9f7f17b07
--- /dev/null
+++ b/Cxx11/transpose-cblas.cc
@@ -0,0 +1,178 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+#if defined(MKL)
+#include <mkl.h>
+#ifdef MKL_ILP64
+#error Use the MKL library for 32-bit integers!
+#endif
+#elif defined(ACCELERATE)
+// The location of cblas.h is not in the system include path when -framework Accelerate is provided.
+#include <Accelerate/Accelerate.h>
+#else
+#include <cblas.h>
+#endif
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CBLAS Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto trans_time = 0.0;
+
+  std::vector<double> A(order*order);
+  std::vector<double> B(order*order,0.0);
+  std::vector<double> T(order*order);
+  double one[1] = {1.0};
+
+  // fill A with the sequence 0 to order^2-1 as doubles
+  std::iota(A.begin(), A.end(), 0.0);
+
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) trans_time = prk::wtime();
+
+      // T = transpose(A)
+#if defined(MKL)
+      mkl_domatcopy('R','T', order, order, 1.0, &(A[0]), order, &(T[0]), order);
+#elif defined(ACCELERATE)
+      vDSP_mtransD(&(A[0]), 1, &(T[0]), 1, order, order);
+#else
+#warning No CBLAS transpose extension available!
+      for (auto i=0;i<order; i++) {
+        for (auto j=0;j<order;j++) {
+          T2[i*order+j] = A[j*order+i];
+        }
+      }
+#endif
+      // B += T
+      cblas_daxpy(order*order, 1.0, &(T[0]), 1, &(B[0]), 1);
+      // A += 1
+      cblas_daxpy(order*order, 1.0, one, 0, &(A[0]), 1);
+    }
+    trans_time = prk::wtime() - trans_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  // TODO: replace with std::generate, std::accumulate, or similar
+  for (auto j=0; j<order; j++) {
+    for (auto i=0; i<order; i++) {
+      const int ij = i*order+j;
+      const int ji = j*order+i;
+      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/README.md b/README.md
index 8bd22e28a..7214b0a9e 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ f = see footnotes
 | RAJA                 |  y  |    y    |     y     |    y    |        |       |
 | CUDA                 |  i  |    y    |     y     |    y    |        |       |
 | CUBLAS               |     |         |     y     |    y    |        |   y   |
-| CBLAS                |     |         |           |         |        |   y   |
+| CBLAS                |     |         |     y     |         |        |   y   |
 | OpenACC              |  y  |         |           |         |        |       |
 
 * [SYCL](http://sycl.tech/)
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 00b4395e6..432592d05 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -344,7 +344,8 @@ case "$PRK_TARGET" in
         # C++11 with CBLAS
         if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
             echo "CBLASFLAG=-DACCELERATE -framework Accelerate" >> common/make.defs
-            make -C $PRK_TARGET_PATH dgemm-cblas
+            make -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas
+            $PRK_TARGET_PATH/transpose-cblas    10 1024
             $PRK_TARGET_PATH/dgemm-cblas        10 400
         fi
 
diff --git a/travis/install-mpi.sh b/travis/install-mpi.sh
index 236f2f419..e1691c272 100755
--- a/travis/install-mpi.sh
+++ b/travis/install-mpi.sh
@@ -27,6 +27,8 @@ case "$os" in
                 brew upgrade mpich || brew install mpich || true
                 ;;
             openmpi)
+                brew upgrade gcc || brew install gcc || true
+                brew link --overwrite gcc || true
                 brew upgrade openmpi || brew install openmpi || true
                 ;;
             *)

From 180230e096f4caab81d0c7f1255a6a2eb7cf522b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 3 Jun 2018 20:39:46 -0700
Subject: [PATCH 097/245] use std instead of pstl namespace and abstract it
 away (#353)

---
 Cxx11/generate-cxx-stencil.py       |  4 +--
 Cxx11/nstream-vector-pstl.cc        |  4 +--
 Cxx11/p2p-hyperplane-vector-pstl.cc |  4 +--
 Cxx11/prk_pstl.h                    |  1 +
 Cxx11/stencil-vector-pstl.cc        | 10 ++++----
 Cxx11/stencil_pstl.hpp              | 40 ++++++++++++++---------------
 Cxx11/transpose-vector-pstl.cc      |  4 +--
 7 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index b3b573887..a4154e9d3 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -47,8 +47,8 @@ def codegen(src,pattern,stencil_size,radius,W,model):
     elif (model=='pstl'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
-        src.write('    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {\n')
-        src.write('      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n')
+        src.write('    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {\n')
+        src.write('      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n')
     elif (model=='raja'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         #src.write('    RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>\n')
diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index 9eb18c78f..0bab633b0 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -127,7 +127,7 @@ int main(int argc, char * argv[])
 
   {
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-    std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
+    std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
 #warning GNU parallel
@@ -145,7 +145,7 @@ int main(int argc, char * argv[])
       if (iter==1) nstream_time = prk::wtime();
 
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-      std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
+      std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
       __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) {
diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
index e17412ac2..132b26a45 100644
--- a/Cxx11/p2p-hyperplane-vector-pstl.cc
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -139,7 +139,7 @@ int main(int argc, char* argv[])
         const auto end   = std::min(i,n)+1;
         auto range = prk::range(begin,end);
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-        std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) {
+        std::for_each( exec::par, std::begin(range), std::end(range), [&] (auto j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
         __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) {
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
         const auto end   = std::min(i,nb+1)+1;
         auto range = prk::range(begin,end);
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-        std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) {
+        std::for_each( exec::par, std::begin(range), std::end(range), [&] (auto j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
         __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) {
diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h
index 5c89d765f..11e0368bb 100644
--- a/Cxx11/prk_pstl.h
+++ b/Cxx11/prk_pstl.h
@@ -47,6 +47,7 @@
 #  include <parallel/algorithm>
 #  include <parallel/numeric>
 # endif
+namespace exec = std::execution;
 #endif
 
 #endif /* PRK_PSTL_H */
diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc
index 6c14800af..a328b1420 100644
--- a/Cxx11/stencil-vector-pstl.cc
+++ b/Cxx11/stencil-vector-pstl.cc
@@ -183,8 +183,8 @@ int main(int argc, char* argv[])
   // initialize the input and output arrays
   auto range = prk::range(0,n);
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-  std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) {
-    std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) {
+  std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) {
+    std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
   __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) {
@@ -205,8 +205,8 @@ int main(int argc, char* argv[])
     // Add constant to solution to force refresh of neighbor data, if any
 #if 0
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-    std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) {
-      std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) {
+    std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
       __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) {
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     });
 #else
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-    std::transform( pstl::execution::par_unseq, in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
+    std::transform( exec::par_unseq, in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
     __gnu_parallel::transform( in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp
index 8713da4d8..03f24fcb5 100644
--- a/Cxx11/stencil_pstl.hpp
+++ b/Cxx11/stencil_pstl.hpp
@@ -1,7 +1,7 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(1,n-1);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
                           +in[(i+0)*n+(j+-1)] * -0.5
                           +in[(i+0)*n+(j+1)] * 0.5
@@ -12,8 +12,8 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(2,n-2);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
                           +in[(i+-1)*n+(j+0)] * -0.25
                           +in[(i+0)*n+(j+-2)] * -0.125
@@ -28,8 +28,8 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(3,n-3);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
                           +in[(i+-2)*n+(j+0)] * -0.0833333333333
                           +in[(i+-1)*n+(j+0)] * -0.166666666667
@@ -48,8 +48,8 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(4,n-4);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
                           +in[(i+-3)*n+(j+0)] * -0.0416666666667
                           +in[(i+-2)*n+(j+0)] * -0.0625
@@ -72,8 +72,8 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(5,n-5);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
                           +in[(i+-4)*n+(j+0)] * -0.025
                           +in[(i+-3)*n+(j+0)] * -0.0333333333333
@@ -100,8 +100,8 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(1,n-1);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
                           +in[(i+-1)*n+(j+0)] * -0.25
                           +in[(i+0)*n+(j+-1)] * -0.25
@@ -115,8 +115,8 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(2,n-2);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
                           +in[(i+-2)*n+(j+-1)] * -0.0208333333333
                           +in[(i+-2)*n+(j+0)] * -0.0208333333333
@@ -144,8 +144,8 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(3,n-3);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
                           +in[(i+-3)*n+(j+-2)] * -0.00555555555556
                           +in[(i+-3)*n+(j+-1)] * -0.00555555555556
@@ -195,8 +195,8 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(4,n-4);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
                           +in[(i+-4)*n+(j+-3)] * -0.00223214285714
                           +in[(i+-4)*n+(j+-2)] * -0.00223214285714
@@ -276,8 +276,8 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     auto inside = prk::range(5,n-5);
-    std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {
-      std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {
+    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
+      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
             out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
                           +in[(i+-5)*n+(j+-4)] * -0.00111111111111
                           +in[(i+-5)*n+(j+-3)] * -0.00111111111111
diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc
index f51f76e7f..e94172bd6 100644
--- a/Cxx11/transpose-vector-pstl.cc
+++ b/Cxx11/transpose-vector-pstl.cc
@@ -116,8 +116,8 @@ int main(int argc, char * argv[])
 
     // transpose
 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-  std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) {
-    std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) {
+  std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) {
+    std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
   __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) {

From 3fb516a7a543f7c4d3485bea713b4f1bcacd62ac Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 3 Jun 2018 20:39:59 -0700
Subject: [PATCH 098/245] Fix GCC-8 warnings (#352)

* use std instead of pstl namespace and abstract it away

* fix all valid GCC-8 compiler warnings
---
 FORTRAN/Makefile                    |  2 +-
 FORTRAN/dgemm-openmp-target.f90     |  2 +-
 FORTRAN/dgemm-pretty.f90            |  2 +-
 FORTRAN/dgemm-taskloop-openmp.f90   |  2 +-
 FORTRAN/dgemm.f90                   |  2 +-
 FORTRAN/nstream-openmp-target.f90   | 12 +-----------
 FORTRAN/nstream-ornlacc.f90         |  2 +-
 FORTRAN/nstream-pretty.f90          |  3 +--
 FORTRAN/nstream-taskloop-openmp.f90 |  2 +-
 FORTRAN/nstream.f90                 |  6 +++++-
 FORTRAN/p2p-innerloop-ornlacc.f90   |  2 ++
 FORTRAN/p2p-ornlacc.f90             |  1 -
 FORTRAN/transpose-openmp-target.f90 | 22 +++++++++++-----------
 13 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index 898a237c4..4d479881c 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -65,7 +65,7 @@ ornlacc: p2p-ornlacc p2p-innerloop-ornlacc stencil-ornlacc transpose-ornlacc nst
 	$(FC) $(FCFLAGS) $< -o $@
 
 stencil: stencil.f90 stencil_serial.f90
-	$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o
+	#$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o
 	$(FC) $(FCFLAGS) $< -o $@
 
 %-pretty: %-pretty.f90
diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.f90
index a3b5a6e41..ed2193bba 100644
--- a/FORTRAN/dgemm-openmp-target.f90
+++ b/FORTRAN/dgemm-openmp-target.f90
@@ -197,7 +197,7 @@ program main
   if (residuum .lt. epsilon) then
     write(*,'(a)') 'Solution validates'
     avgtime = dgemm_time/iterations
-    nflops = 2 * forder**3
+    nflops = 2 * int(order,INT64)**3
     write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, &
            ' Avg time (s): ', avgtime
   else
diff --git a/FORTRAN/dgemm-pretty.f90 b/FORTRAN/dgemm-pretty.f90
index 650aa4243..e1e6ac7c2 100644
--- a/FORTRAN/dgemm-pretty.f90
+++ b/FORTRAN/dgemm-pretty.f90
@@ -178,7 +178,7 @@ program main
   if (residuum .lt. epsilon) then
     write(*,'(a)') 'Solution validates'
     avgtime = dgemm_time/iterations
-    nflops = 2 * forder**3
+    nflops = 2 * int(order,INT64)**3
     write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, &
            ' Avg time (s): ', avgtime
   else
diff --git a/FORTRAN/dgemm-taskloop-openmp.f90 b/FORTRAN/dgemm-taskloop-openmp.f90
index 67c1a3884..b127dd356 100644
--- a/FORTRAN/dgemm-taskloop-openmp.f90
+++ b/FORTRAN/dgemm-taskloop-openmp.f90
@@ -252,7 +252,7 @@ program main
   if (residuum .lt. epsilon) then
     write(*,'(a)') 'Solution validates'
     avgtime = dgemm_time/iterations
-    nflops = 2 * forder**3
+    nflops = 2 * int(order,INT64)**3
     write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, &
            ' Avg time (s): ', avgtime
   else
diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.f90
index dd7d18aaa..5f678c981 100644
--- a/FORTRAN/dgemm.f90
+++ b/FORTRAN/dgemm.f90
@@ -304,7 +304,7 @@ program main
   if (residuum .lt. epsilon) then
     write(*,'(a)') 'Solution validates'
     avgtime = dgemm_time/iterations
-    nflops = 2 * forder**3
+    nflops = 2 * int(order,INT64)**3
     write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, &
            ' Avg time (s): ', avgtime
   else
diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.f90
index 96c4b1679..954a86b1e 100644
--- a/FORTRAN/nstream-openmp-target.f90
+++ b/FORTRAN/nstream-openmp-target.f90
@@ -62,20 +62,10 @@
 !
 ! *******************************************************************
 
-function prk_get_wtime() result(t)
-  use iso_fortran_env
-  implicit none
-  real(kind=REAL64) ::  t
-  integer(kind=INT64) :: c, r
-  call system_clock(count = c, count_rate = r)
-  t = real(c,REAL64) / real(r,REAL64)
-end function prk_get_wtime
-
 program main
   use iso_fortran_env
   use omp_lib
   implicit none
-  real(kind=REAL64) :: prk_get_wtime
   ! for argument parsing
   integer :: err
   integer :: arglen
@@ -228,7 +218,7 @@ program main
   else
     write(*,'(a17)') 'Solution validates'
     avgtime = nstream_time/iterations;
-    bytes = 4.0 * int(length,INT64) * storage_size(A)/8
+    bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)')    &
         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
         'Avg time (s): ', avgtime
diff --git a/FORTRAN/nstream-ornlacc.f90 b/FORTRAN/nstream-ornlacc.f90
index 033dee814..5769d4e35 100644
--- a/FORTRAN/nstream-ornlacc.f90
+++ b/FORTRAN/nstream-ornlacc.f90
@@ -220,7 +220,7 @@ program main
   else
     write(*,'(a17)') 'Solution validates'
     avgtime = nstream_time/iterations;
-    bytes = 4.0 * int(length,INT64) * storage_size(A)/8
+    bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)')    &
         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
         'Avg time (s): ', avgtime
diff --git a/FORTRAN/nstream-pretty.f90 b/FORTRAN/nstream-pretty.f90
index a15e365ec..e6c9038fb 100644
--- a/FORTRAN/nstream-pretty.f90
+++ b/FORTRAN/nstream-pretty.f90
@@ -88,7 +88,6 @@ program main
   real(kind=REAL64) :: scalar
   integer(kind=INT64) :: bytes
   ! runtime variables
-  integer(kind=INT64) :: i
   integer(kind=INT32) :: k
   real(kind=REAL64) ::  asum, ar, br, cr, ref
   real(kind=REAL64) ::  t0, t1, nstream_time, avgtime
@@ -205,7 +204,7 @@ program main
   else
     write(*,'(a17)') 'Solution validates'
     avgtime = nstream_time/iterations;
-    bytes = 4.0 * int(length,INT64) * storage_size(A)/8
+    bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)')    &
         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
         'Avg time (s): ', avgtime
diff --git a/FORTRAN/nstream-taskloop-openmp.f90 b/FORTRAN/nstream-taskloop-openmp.f90
index 636e45d73..65d8fd056 100644
--- a/FORTRAN/nstream-taskloop-openmp.f90
+++ b/FORTRAN/nstream-taskloop-openmp.f90
@@ -226,7 +226,7 @@ program main
   else
     write(*,'(a17)') 'Solution validates'
     avgtime = nstream_time/iterations;
-    bytes = 4.0 * int(length,INT64) * storage_size(A)/8
+    bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)')    &
         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
         'Avg time (s): ', avgtime
diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90
index 9d35024b5..6aa9c1529 100644
--- a/FORTRAN/nstream.f90
+++ b/FORTRAN/nstream.f90
@@ -62,6 +62,7 @@
 !
 ! *******************************************************************
 
+#ifndef _OPENMP
 function prk_get_wtime() result(t)
   use iso_fortran_env
   implicit none
@@ -70,6 +71,7 @@ function prk_get_wtime() result(t)
   call system_clock(count = c, count_rate = r)
   t = real(c,REAL64) / real(r,REAL64)
 end function prk_get_wtime
+#endif
 
 program main
   use iso_fortran_env
@@ -77,7 +79,9 @@ program main
   use omp_lib
 #endif
   implicit none
+#ifndef _OPENMP
   real(kind=REAL64) :: prk_get_wtime
+#endif
   ! for argument parsing
   integer :: err
   integer :: arglen
@@ -288,7 +292,7 @@ program main
   else
     write(*,'(a17)') 'Solution validates'
     avgtime = nstream_time/iterations;
-    bytes = 4.0 * int(length,INT64) * storage_size(A)/8
+    bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)')    &
         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
         'Avg time (s): ', avgtime
diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.f90
index 32c24a4d4..ee35ca58a 100644
--- a/FORTRAN/p2p-innerloop-ornlacc.f90
+++ b/FORTRAN/p2p-innerloop-ornlacc.f90
@@ -132,6 +132,8 @@ program main
     stop 1
   endif
 
+  t0 = 0;
+
   do j=1,n
     do i=1,n
       grid(i,j) = 0.0d0
diff --git a/FORTRAN/p2p-ornlacc.f90 b/FORTRAN/p2p-ornlacc.f90
index 6a9a97e23..18ee965e2 100644
--- a/FORTRAN/p2p-ornlacc.f90
+++ b/FORTRAN/p2p-ornlacc.f90
@@ -78,7 +78,6 @@ program main
   real(kind=REAL64), allocatable :: grid(:,:)           ! array holding grid values
   ! runtime variables
   integer(kind=INT32) :: i, j, k
-  integer ::  me, nt
   real(kind=REAL64) ::  t0, t1, pipeline_time, avgtime  ! timing parameters
   real(kind=REAL64), parameter ::  epsilon=1.D-8        ! error tolerance
 
diff --git a/FORTRAN/transpose-openmp-target.f90 b/FORTRAN/transpose-openmp-target.f90
index b226f89af..1da28346a 100644
--- a/FORTRAN/transpose-openmp-target.f90
+++ b/FORTRAN/transpose-openmp-target.f90
@@ -67,7 +67,7 @@ program main
   integer(kind=INT64) ::  bytes                     ! combined size of matrices
   ! runtime variables
   integer(kind=INT32) ::  i, j, k
-  integer(kind=INT32) ::  it, jt, tile_size
+  !integer(kind=INT32) ::  it, jt, tile_size
   real(kind=REAL64) ::  abserr, addit, temp         ! squared error
   real(kind=REAL64) ::  t0, t1, trans_time, avgtime ! timing parameters
   real(kind=REAL64), parameter ::  epsilon=1.D-8    ! error tolerance
@@ -102,16 +102,16 @@ program main
   endif
 
   ! same default as the C implementation
-  tile_size = 32
-  if (command_argument_count().gt.2) then
-      call get_command_argument(3,argtmp,arglen,err)
-      if (err.eq.0) read(argtmp,'(i32)') tile_size
-  endif
-  if ((tile_size .lt. 1).or.(tile_size.gt.order)) then
-    write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,&
-                           ' must be >= 1 and <= ',order
-    tile_size = order ! no tiling
-  endif
+  !tile_size = 32
+  !if (command_argument_count().gt.2) then
+  !    call get_command_argument(3,argtmp,arglen,err)
+  !    if (err.eq.0) read(argtmp,'(i32)') tile_size
+  !endif
+  !if ((tile_size .lt. 1).or.(tile_size.gt.order)) then
+  !  write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,&
+  !                         ' must be >= 1 and <= ',order
+  !  tile_size = order ! no tiling
+  !endif
 
   ! ********************************************************************
   ! ** Allocate space for the input and transpose matrix

From 505328db16d543453fa465cf9b32861adb07b4f7 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 3 Jun 2018 20:48:33 -0700
Subject: [PATCH 099/245] deprecate Cilk (#357)

remove Cilk from default build
---
 C1z/Makefile         | 2 +-
 common/make.defs.gcc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/C1z/Makefile b/C1z/Makefile
index a636e7ca9..0df8225c1 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -22,7 +22,7 @@ ORNLACCFLAGS = $(ORNLACCFLAG)
 CILKFLAGS = $(CILKFLAG)
 ISPCFLAGS = $(ISPCFLAG)
 
-.PHONY: all clean serial thread openmp target taskloop cilk ispc
+.PHONY: all clean serial thread openmp target taskloop ispc # cilk
 
 EXTRA=
 ifeq ($(shell uname -s),Darwin)
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index e4eda09f5..732083da1 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -62,7 +62,7 @@ SYCLFLAG=-I$(SYCLDIR)/include
 #
 # Cilk
 #
-CILKFLAG=-fcilkplus
+#CILKFLAG=-fcilkplus
 #
 # TBB
 #

From a03e2973adce711a6e65b539fb5bc465a04f53e0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 3 Jun 2018 21:46:44 -0700
Subject: [PATCH 100/245] Split C++ headers (#356)

* refactor headers
* remove unnecessary preprocessor token
* relocate preprocess tokens
* update make.def examples
* further localize include files where needed
* use std instead of pstl namespace and abstract it away
* move thread/future includes into relevant sources
---
 Cxx11/prk_util.h                 | 7 -------
 Cxx11/transpose-vector-async.cc  | 7 +++++++
 Cxx11/transpose-vector-thread.cc | 7 +++++++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 2c0be683f..321f91c8c 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -64,13 +64,6 @@
 #include <numeric>
 #include <algorithm>
 
-// These headers are busted with NVCC and GCC 5.4.0
-// The <future> header is busted with Cray C++ 8.6.1.
-#if !defined(__NVCC__) && !defined(_CRAYC)
-#include <thread>
-#include <future>
-#endif
-
 #include "prk_simd.h"
 
 #ifdef USE_RANGES
diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc
index 8f285b1ad..c68b8c463 100644
--- a/Cxx11/transpose-vector-async.cc
+++ b/Cxx11/transpose-vector-async.cc
@@ -54,6 +54,13 @@
 
 #include "prk_util.h"
 
+// These headers are busted with NVCC and GCC 5.4.0
+// The <future> header is busted with Cray C++ 8.6.1.
+#if !defined(__NVCC__) && !defined(_CRAYC)
+#include <thread>
+#include <future>
+#endif
+
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc
index 57fbf11ea..44071ca95 100644
--- a/Cxx11/transpose-vector-thread.cc
+++ b/Cxx11/transpose-vector-thread.cc
@@ -54,6 +54,13 @@
 
 #include "prk_util.h"
 
+// These headers are busted with NVCC and GCC 5.4.0
+// The <future> header is busted with Cray C++ 8.6.1.
+#if !defined(__NVCC__) && !defined(_CRAYC)
+#include <thread>
+#include <future>
+#endif
+
 int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;

From 661463486d3a6c87b460398ece6942c4e5acd684 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 3 Jun 2018 21:49:41 -0700
Subject: [PATCH 101/245] C++ p2p blocked doacross (#355)

* block doacross like tasks to make it suck less
---
 Cxx11/p2p-doacross-vector-openmp.cc | 51 ++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/Cxx11/p2p-doacross-vector-openmp.cc b/Cxx11/p2p-doacross-vector-openmp.cc
index 2d271c92b..37b9802f0 100644
--- a/Cxx11/p2p-doacross-vector-openmp.cc
+++ b/Cxx11/p2p-doacross-vector-openmp.cc
@@ -60,11 +60,16 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "p2p-kernel.h"
 
 int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+#ifdef _OPENMP
   std::cout << "C++11/OpenMP DOACROSS pipeline execution on 2D grid" << std::endl;
+#else
+  std::cout << "C++11/Serial pipeline execution on 2D grid" << std::endl;
+#endif
 
   //////////////////////////////////////////////////////////////////////
   // Process and test input parameters
@@ -92,15 +97,27 @@ int main(int argc, char* argv[])
       } else if ( static_cast<size_t>(m)*static_cast<size_t>(n) > INT_MAX) {
         throw "ERROR: grid dimension too large - overflow risk";
       }
+
+      // grid chunk dimensions
+      mc = (argc > 4) ? std::atoi(argv[4]) : m;
+      nc = (argc > 5) ? std::atoi(argv[5]) : n;
+      if (mc < 1 || mc > m || nc < 1 || nc > n) {
+        std::cout << "WARNING: grid chunk dimensions invalid: " << mc <<  nc << " (ignoring)" << std::endl;
+        mc = m;
+        nc = n;
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
     return 1;
   }
 
+#ifdef _OPENMP
   std::cout << "Number of threads (max)   = " << omp_get_max_threads() << std::endl;
+#endif
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Grid sizes           = " << m << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << mc << ", " << nc << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
@@ -113,13 +130,12 @@ int main(int argc, char* argv[])
   OMP_PARALLEL()
   {
     OMP_FOR()
-    for (auto i=0; i<n; i++) {
-      for (auto j=0; j<n; j++) {
+    for (int i=0; i<n; i++) {
+      for (int j=0; j<n; j++) {
         grid[i*n+j] = 0.0;
       }
     }
 
-    // set boundary values (bottom and left side of grid)
     OMP_MASTER
     {
       for (auto j=0; j<n; j++) {
@@ -131,7 +147,10 @@ int main(int argc, char* argv[])
     }
     OMP_BARRIER
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    int const ib = prk::divceil(m,mc);
+    int const jb = prk::divceil(n,nc);
+
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) {
           OMP_BARRIER
@@ -139,15 +158,25 @@ int main(int argc, char* argv[])
           pipeline_time = prk::wtime();
       }
 
-      OMP_FOR( collapse(2) ordered(2) )
-      for (auto i=1; i<m; i++) {
-        for (auto j=1; j<n; j++) {
-          OMP_ORDERED( depend(sink: i-1,j) depend(sink: i,j-1) depend(sink: i-1,j-1) )
-          grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-          OMP_ORDERED( depend (source) )
+      if (mc==m && nc==n) {
+        OMP_FOR( collapse(2) ordered(2) )
+        for (int i=1; i<m; i++) {
+          for (int j=1; j<n; j++) {
+            OMP_ORDERED( depend(sink: i-1,j) depend(sink: i,j-1) )
+            grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+            OMP_ORDERED( depend (source) )
+          }
+        }
+      } else {
+        OMP_FOR( collapse(2) ordered(2) )
+        for (int i=0; i<ib; i++) {
+          for (int j=0; j<jb; j++) {
+            OMP_ORDERED( depend(sink: i-1,j) depend(sink: i,j-1) )
+            sweep_tile(i*mc+1, std::min(m,(i+1)*mc+1), j*nc+1, std::min(n,(j+1)*nc+1), n, grid);
+            OMP_ORDERED( depend (source) )
+          }
         }
       }
-
       OMP_MASTER
       grid[0*n+0] = -grid[(m-1)*n+(n-1)];
     }

From c0ad67a134f314f447c10f8b38034e67ff34a0ba Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 3 Jun 2018 22:29:08 -0700
Subject: [PATCH 102/245] Travis Clang OpenMP (#358)

* attempt to use Clang OpenMP @ Mac in Travis
---
 travis/build-run-prk.sh | 57 ++++++++++++++++++++++++++---------------
 travis/install-clang.sh | 26 ++-----------------
 travis/install-deps.sh  |  2 +-
 3 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 432592d05..0b0827729 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -265,9 +265,9 @@ case "$PRK_TARGET" in
         case $CXX in
             g++)
                 if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "x$PRK_CXX" = "x" ] ; then
-                  for version in "-9" "-8" "-7" "-6" "-5" "" ; do
+                  for version in "9" "8" "7" "6" "5" "" ; do
                     if [ -f "`which /usr/local/opt/gcc@${version}/bin/g++-${version}`" ]; then
-                        export PRK_CXX="`which /usr/local/opt/llvm@${version}/bin/clang++`"
+                        export PRK_CXX="`which /usr/local/opt/gcc@${version}/bin/g++-${version}`"
                         echo "Found C++: $PRK_CXX"
                         break
                     fi
@@ -289,16 +289,16 @@ case "$PRK_TARGET" in
             clang++)
                 # Homebrew does not always place the best/latest Clang/LLVM in the default path
                 if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "x$PRK_CXX" = "x" ] ; then
-                  for version in "" "4.1" "4" "4.0" "-3.9" "-3.8" "-3.7" "-3.6" ; do
-                    if [ -f "`which /usr/local/opt/llvm@${version}/bin/clang++`" ]; then
-                        export PRK_CXX="`which /usr/local/opt/llvm@${version}/bin/clang++`"
+                  for version in "" "@6" "@5" "@4" ; do
+                    if [ -f "`which /usr/local/opt/llvm${version}/bin/clang++`" ]; then
+                        export PRK_CXX="`which /usr/local/opt/llvm${version}/bin/clang++`"
                         echo "Found C++: $PRK_CXX"
                         break
                     fi
                   done
                 fi
                 if [ "x$PRK_CXX" = "x" ] ; then
-                  for version in "-5" "-4.1" "-4" "-4.0" "-3.9" "-3.8" "-3.7" "-3.6" "" ; do
+                  for version in "-6" "-5" "-4.1" "-4" "-4.0" "-3.9" "-3.8" "-3.7" "-3.6" "" ; do
                     if [ -f "`which ${CXX}${version}`" ]; then
                         export PRK_CXX="${CXX}${version}"
                         echo "Found C++: $PRK_CXX"
@@ -392,20 +392,37 @@ case "$PRK_TARGET" in
                 $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc     10 1024 64
                 ;;
             clang)
-                # Host
-                echo "Skipping Clang since OpenMP support probably missing"
-                #echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                #make -C $PRK_TARGET_PATH openmp
-                #$PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
-                #$PRK_TARGET_PATH/stencil-vector-openmp            10 1000
-                #$PRK_TARGET_PATH/transpose-vector-openmp          10 1024 32
-                #$PRK_TARGET_PATH/nstream-vector-openmp            10 16777216 32
-                #echo "Test stencil code generator"
-                #for s in star grid ; do
-                #    for r in 1 2 3 4 5 ; do
-                #        $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
-                #    done
-                #done
+                if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
+                    # Host
+                    echo "OPENMPFLAG=-fopenmp" >> common/make.defs
+                    make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \
+                                             transpose-vector-openmp nstream-vector-openmp
+                    $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
+                    $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024
+                    $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024 64
+                    $PRK_TARGET_PATH/stencil-vector-openmp            10 1000
+                    $PRK_TARGET_PATH/transpose-vector-openmp          10 1024 32
+                    $PRK_TARGET_PATH/nstream-vector-openmp            10 16777216 32
+                    #echo "Test stencil code generator"
+                    for s in star grid ; do
+                        for r in 1 2 3 4 5 ; do
+                            $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                        done
+                    done
+                    # Offload
+                    #echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs
+                    #make -C $PRK_TARGET_PATH target
+                    #$PRK_TARGET_PATH/stencil-openmp-target     10 1000
+                    #$PRK_TARGET_PATH/transpose-openmp-target   10 1024 32
+                    ##echo "Test stencil code generator"
+                    #for s in star grid ; do
+                    #    for r in 1 2 3 4 5 ; do
+                    #        $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                    #    done
+                    #done
+                else
+                    echo "Skipping Clang since OpenMP support probably missing"
+                fi
                 ;;
             icc)
                 # Host
diff --git a/travis/install-clang.sh b/travis/install-clang.sh
index a4509c0db..6b2178e7d 100755
--- a/travis/install-clang.sh
+++ b/travis/install-clang.sh
@@ -4,7 +4,6 @@ set -e
 set -x
 
 TRAVIS_ROOT="$1"
-CLANG_VERSION="$2"
 
 if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then
     os=`uname`
@@ -12,29 +11,8 @@ if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then
         Darwin)
             echo "Mac"
             brew update
-            case "$CLANG_VERSION" in
-                omp)
-                    brew install clang-omp || brew upgrade clang-omp
-                    #brew test clang-omp
-                    # make sure that these are found before the system installation
-                    # there are less evil but less local ways to impart this effect
-                    if [ ! -d "$TRAVIS_ROOT/bin" ]; then
-                        mkdir -p $TRAVIS_ROOT/bin
-                    fi
-                    # we should refer to clang-omp* explicitly so know it exists and works
-                    ln -s `which clang-omp`   $TRAVIS_ROOT/bin/clang
-                    ln -s `which clang-omp++` $TRAVIS_ROOT/bin/clang++
-                    ;;
-                3*)
-                    #brew install llvm$CLANG_VERSION --with-clang --with-compiler-rt --with-libcxx --with-lld --without-assertions
-                    brew install llvm@$CLANG_VERSION || brew upgrade llvm@$CLANG_VERSION
-                    #brew test llvm@$CLANG_VERSION
-                    ;;
-                *)
-                    echo "Unsupported version of Clang"
-                    echo "Travis will continue and use the system default"
-                    ;;
-            esac
+            brew install llvm || brew upgrade llvm || true
+            brew install libomp || brew upgrade libomp || true
             ;;
         Linux)
             echo "Linux Clang/LLVM builds not supported!"
diff --git a/travis/install-deps.sh b/travis/install-deps.sh
index a82df34cc..89243cd93 100755
--- a/travis/install-deps.sh
+++ b/travis/install-deps.sh
@@ -57,7 +57,7 @@ case "$PRK_TARGET" in
             sh ./travis/install-gcc.sh $TRAVIS_ROOT
         fi
         if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "${CC}" = "clang" ] ; then
-            sh ./travis/install-clang.sh $TRAVIS_ROOT 3.9
+            sh ./travis/install-clang.sh $TRAVIS_ROOT
         fi
         sh ./travis/install-tbb.sh $TRAVIS_ROOT
         sh ./travis/install-pstl.sh $TRAVIS_ROOT

From 8601c7fb7efbd0b50ca0d0073af40cfb6137fea1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 4 Jun 2018 09:41:39 -0700
Subject: [PATCH 103/245] Raja views (#359)

* add RAJA views+ranges nstream

homogenize with existing implementation

* add RAJA transpose with views and ranges

homogenize with existing version

* add RAJA stencil with views and ranges

* add RAJA views to Travis

* use RAJA develop branch

* add RAJA views version of p2p

* add RAJA p2p to Travis

* make work sans OpenMP
---
 .gitignore                     |   3 +
 Cxx11/Makefile                 |   3 +-
 Cxx11/generate-cxx-stencil.py  |  27 ++-
 Cxx11/nstream-raja.cc          | 190 ++++++++++++++++
 Cxx11/nstream-vector-raja.cc   |   8 +-
 Cxx11/p2p-raja.cc              | 185 ++++++++++++++++
 Cxx11/p2p-vector-raja.cc       |   4 +-
 Cxx11/prk_raja.h               |  11 +
 Cxx11/stencil-raja.cc          | 245 +++++++++++++++++++++
 Cxx11/stencil-vector-raja.cc   |   9 -
 Cxx11/stencil_kokkos.hpp       |  20 +-
 Cxx11/stencil_pgnu.hpp         |  40 ++--
 Cxx11/stencil_pstl.hpp         |  40 ++--
 Cxx11/stencil_raja.hpp         |  40 ++--
 Cxx11/stencil_rajaview.hpp     | 385 +++++++++++++++++++++++++++++++++
 Cxx11/stencil_stl.hpp          |  40 ++--
 Cxx11/transpose-raja.cc        | 186 ++++++++++++++++
 Cxx11/transpose-vector-raja.cc |  11 +-
 travis/build-run-prk.sh        |  13 +-
 travis/install-raja.sh         |   3 +-
 20 files changed, 1338 insertions(+), 125 deletions(-)
 create mode 100644 Cxx11/nstream-raja.cc
 create mode 100644 Cxx11/p2p-raja.cc
 create mode 100644 Cxx11/stencil-raja.cc
 create mode 100644 Cxx11/stencil_rajaview.hpp
 create mode 100644 Cxx11/transpose-raja.cc

diff --git a/.gitignore b/.gitignore
index 91cb027fc..099e56f2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -145,6 +145,7 @@ Cxx11/nstream-vector
 Cxx11/nstream-vector-openmp
 Cxx11/nstream-vector-pstl
 Cxx11/nstream-vector-raja
+Cxx11/nstream-raja
 Cxx11/nstream-vector-rangefor
 Cxx11/nstream-vector-stl
 Cxx11/nstream-vector-taskloop
@@ -164,6 +165,7 @@ Cxx11/stencil-vector-cilk
 Cxx11/stencil-vector-stl
 Cxx11/stencil-vector-pstl
 Cxx11/stencil-vector-raja
+Cxx11/stencil-raja
 Cxx11/stencil-vector-rangefor
 Cxx11/stencil-vector-tbb
 Cxx11/stencil-vector-taskloop
@@ -182,6 +184,7 @@ Cxx11/transpose-vector-cilk
 Cxx11/transpose-vector-stl
 Cxx11/transpose-vector-pstl
 Cxx11/transpose-vector-raja
+Cxx11/transpose-raja
 Cxx11/transpose-vector-rangefor
 Cxx11/transpose-vector-tbb
 Cxx11/transpose-vector-taskloop
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 255c706e2..d1223b894 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -113,7 +113,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range
 
 kokkos: stencil-kokkos transpose-kokkos nstream-kokkos
 
-raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja
+raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \
+      p2p-raja stencil-raja transpose-raja nstream-raja
 
 cuda: stencil-cuda transpose-cuda nstream-cuda
 
diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index a4154e9d3..e2ec18d37 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -57,6 +57,11 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         #src.write('              [&](RAJA::Index_type i, RAJA::Index_type j) {\n')
         src.write('    RAJA::forall<thread_exec>(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type i) {\n')
         src.write('      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type j) {\n')
+    elif (model=='rajaview'):
+        src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n')
+        src.write('    RAJA::RangeSegment inner1('+str(radius)+',n-'+str(radius)+');\n')
+        src.write('    auto inner2 = RAJA::make_tuple(inner1, inner1);\n')
+        src.write('    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {\n')
     elif (model=='tbb'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('  tbb::blocked_range2d<int> range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n')
@@ -80,7 +85,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('        for (auto i=it; i<std::min(n-'+str(radius)+',it+t); ++i) {\n')
         src.write('          PRAGMA_SIMD\n')
         src.write('          for (auto j=jt; j<std::min(n-'+str(radius)+',jt+t); ++j) {\n')
-    if (model=='kokkos'):
+    if (model=='kokkos' or model=='rajaview'):
         src.write('              out(i,j) += ')
     else:
         src.write('            out[i*n+j] += ')
@@ -90,7 +95,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         for i in range(0,2*radius+1):
             if ( W[j][i] != 0.0):
                 k+=1
-                if (model=='kokkos'):
+                if (model=='kokkos' or model=='rajaview'):
                     src.write('+in(i+'+str(j-radius)+',j+'+str(i-radius)+') * '+str(W[j][i]))
                 else:
                     src.write('+in[(i+'+str(j-radius)+')*n+(j+'+str(i-radius)+')] * '+str(W[j][i]))
@@ -98,14 +103,16 @@ def codegen(src,pattern,stencil_size,radius,W,model):
                 if (k>0 and k<kmax): src.write('                          ')
     src.write(';\n')
     if (model=='stl' or model=='pgnu' or model=='pstl'):
-        src.write('       });\n')
-        src.write('     });\n')
+        src.write('      });\n')
+        src.write('    });\n')
     elif (model=='raja'):
         #src.write('     });\n')
-        src.write('       });\n')
-        src.write('     });\n')
+        src.write('      });\n')
+        src.write('    });\n')
+    elif (model=='rajaview'):
+        src.write('    });\n')
     elif (model=='kokkos'):
-        src.write('     });\n')
+        src.write('    });\n')
     elif (model=='tbb'):
         src.write('      }\n')
         src.write('    }\n')
@@ -148,10 +155,14 @@ def instance(src,model,pattern,r):
     codegen(src,pattern,stencil_size,r,W,model)
 
 def main():
-    for model in ['seq','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','kokkos','cuda']:
+    for model in ['seq','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','rajaview','kokkos','cuda']:
       src = open('stencil_'+model+'.hpp','w')
       if (model=='target'):
           src.write('#define RESTRICT __restrict__\n\n')
+      if (model=='rajaview'):
+          src.write('using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,')
+          src.write('                                           RAJA::statement::For<1, RAJA::simd_exec,')
+          src.write('                                           RAJA::statement::Lambda<0> > > >;')
       #  src.write('OMP( declare target )\n\n')
       for pattern in ['star','grid']:
         for r in range(1,6):
diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc
new file mode 100644
index 000000000..c98dae978
--- /dev/null
+++ b/Cxx11/nstream-raja.cc
@@ -0,0 +1,190 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_raja.h"
+
+#if defined(RAJA_ENABLE_OPENMP)
+  typedef RAJA::omp_parallel_for_exec thread_exec;
+#elif defined(RAJA_ENABLE_TBB)
+  typedef RAJA::tbb_for_exec thread_exec;
+#else
+#warning No OpenMP!
+  typedef RAJA::seq_exec thread_exec;
+#endif
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/RAJA STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time(0);
+
+  double * RESTRICT Amem = new double[length];
+  double * RESTRICT Bmem = new double[length];
+  double * RESTRICT Cmem = new double[length];
+
+  RAJA::View<double, RAJA::Layout<1>> A(Amem, length);
+  RAJA::View<double, RAJA::Layout<1>> B(Bmem, length);
+  RAJA::View<double, RAJA::Layout<1>> C(Cmem, length);
+
+  RAJA::RangeSegment range(0, length);
+
+  double scalar(3);
+
+  {
+    RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
+        A(i) = 0.0;
+        B(i) = 2.0;
+        C(i) = 2.0;
+    });
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
+          A(i) += B(i) + scalar * C(i);
+      });
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  RAJA::ReduceSum<RAJA::seq_reduce, double> reduced_asum(0.0);
+  RAJA::forall<RAJA::seq_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+      reduced_asum += std::fabs(A(i));
+  });
+  double asum(reduced_asum);
+
+  double epsilon=1.e-8;
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc
index 62f92832f..ee3986e50 100644
--- a/Cxx11/nstream-vector-raja.cc
+++ b/Cxx11/nstream-vector-raja.cc
@@ -118,13 +118,13 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  auto nstream_time = 0.0;
+  double nstream_time(0);
 
   std::vector<double> A(length);
   std::vector<double> B(length);
   std::vector<double> C(length);
 
-  double scalar = 3.0;
+  double scalar(3);
 
   {
     RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
@@ -133,7 +133,7 @@ int main(int argc, char * argv[])
         C[i] = 2.0;
     });
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) nstream_time = prk::wtime();
 
@@ -151,7 +151,7 @@ int main(int argc, char * argv[])
   double ar(0);
   double br(2);
   double cr(2);
-  for (auto i=0; i<=iterations; i++) {
+  for (int i=0; i<=iterations; i++) {
       ar += br + scalar * cr;
   }
 
diff --git a/Cxx11/p2p-raja.cc b/Cxx11/p2p-raja.cc
new file mode 100644
index 000000000..202d9b6b6
--- /dev/null
+++ b/Cxx11/p2p-raja.cc
@@ -0,0 +1,185 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an m*n grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <m> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_raja.h"
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/RAJA pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int m, n;
+  int mc, nc;
+  try {
+      if (argc < 4){
+        throw " <# iterations> <first array dimension> <second array dimension> [<first chunk dimension> <second chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      m = std::atoi(argv[2]);
+      n = std::atoi(argv[3]);
+      if (m < 1 || n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(m)*static_cast<size_t>(n) > INT_MAX) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      mc = (argc > 4) ? std::atoi(argv[4]) : m;
+      nc = (argc > 5) ? std::atoi(argv[5]) : n;
+      if (mc < 1 || mc > m || nc < 1 || nc > n) {
+        std::cout << "WARNING: grid chunk dimensions invalid: " << mc <<  nc << " (ignoring)" << std::endl;
+        mc = m;
+        nc = n;
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << m << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << mc << ", " << nc << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto pipeline_time = 0.0; // silence compiler warning
+
+  double * RESTRICT Amem = new double[m*n];
+  matrix grid(Amem, m, n);
+
+  for (int i=0; i<m; i++) {
+    for (int j=0; j<n; j++) {
+      grid(i,j) = 0.0;
+    }
+  }
+  // set boundary values (bottom and left side of grid)
+  for (int j=0; j<n; j++) {
+    grid(0,j) = static_cast<double>(j);
+  }
+  for (int i=0; i<m; i++) {
+    grid(i,0) = static_cast<double>(i);
+  }
+
+  for (int iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) pipeline_time = prk::wtime();
+
+    for (int j=1; j<n; j++) {
+      RAJA::RangeSegment range(1, j+1);
+      RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
+        auto x = i;
+        auto y = j-i+1;
+        grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1);
+      });
+    }
+    for (int j=n-2; j>=1; j--) {
+      RAJA::RangeSegment range(1, j+1);
+      RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
+        auto x = n+i-j-1;
+        auto y = n-i;
+        grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1);
+      });
+    }
+    grid(0,0) = -grid(m-1,n-1);
+  }
+
+  pipeline_time = prk::wtime() - pipeline_time;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(n+m-2.));
+  if ( (std::fabs(grid(m-1,n-1) - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << grid(m-1,n-1)
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/Cxx11/p2p-vector-raja.cc b/Cxx11/p2p-vector-raja.cc
index 00164aa94..e4faddccc 100644
--- a/Cxx11/p2p-vector-raja.cc
+++ b/Cxx11/p2p-vector-raja.cc
@@ -150,14 +150,14 @@ int main(int argc, char* argv[])
     });
 #else
     for (auto j=1; j<n; j++) {
-      RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
+      RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
         auto x = i;
         auto y = j-i+1;
         grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
       });
     }
     for (auto j=n-2; j>=1; j--) {
-      RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
+      RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
         auto x = n+i-j-1;
         auto y = n-i;
         grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
diff --git a/Cxx11/prk_raja.h b/Cxx11/prk_raja.h
index fb0bb25b8..9a8fdab0e 100644
--- a/Cxx11/prk_raja.h
+++ b/Cxx11/prk_raja.h
@@ -37,4 +37,15 @@
 # include "RAJA/RAJA.hpp"
 #endif
 
+#ifdef RAJA_ENABLE_OPENMP
+  typedef RAJA::omp_parallel_for_exec thread_exec;
+  typedef RAJA::omp_reduce reduce_exec;
+#else
+  #warning No RAJA support for OpenMP!
+  typedef RAJA::seq_exec thread_exec;
+  typedef RAJA::seq_reduce reduce_exec;
+#endif
+
+typedef RAJA::View<double, RAJA::Layout<2>> matrix;
+
 #endif /* PRK_RAJA_H */
diff --git a/Cxx11/stencil-raja.cc b/Cxx11/stencil-raja.cc
new file mode 100644
index 000000000..5fa333bce
--- /dev/null
+++ b/Cxx11/stencil-raja.cc
@@ -0,0 +1,245 @@
+
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "in" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_raja.h"
+#include "stencil_rajaview.hpp"
+
+void nothing(const int n, const int t, matrix & in, matrix & out)
+{
+    std::cout << "You are trying to use a stencil that does not exist.\n";
+    std::cout << "Please generate the new stencil using the code generator\n";
+    std::cout << "and add it to the case-switch in the driver." << std::endl;
+    // n will never be zero - this is to silence compiler warnings.
+    //if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl;
+    std::abort();
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/RAJA Stencil execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, n, radius, tile_size;
+  bool star = true;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n  = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+  std::cout << "Radius of stencil    = " << radius << std::endl;
+
+  auto stencil = nothing;
+  if (star) {
+      switch (radius) {
+          case 1: stencil = star1; break;
+          case 2: stencil = star2; break;
+          case 3: stencil = star3; break;
+          case 4: stencil = star4; break;
+          case 5: stencil = star5; break;
+      }
+  } else {
+      switch (radius) {
+          case 1: stencil = grid1; break;
+          case 2: stencil = grid2; break;
+          case 3: stencil = grid3; break;
+          case 4: stencil = grid4; break;
+          case 5: stencil = grid5; break;
+      }
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto stencil_time = 0.0;
+
+  double * RESTRICT imem = new double[n*n];
+  double * RESTRICT omem = new double[n*n];
+
+  RAJA::View<double, RAJA::Layout<2>> in(imem, n, n);
+  RAJA::View<double, RAJA::Layout<2>> out(omem, n, n);
+
+  using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,
+                                             RAJA::statement::For<1, RAJA::simd_exec,
+                                             RAJA::statement::Lambda<0> > > >;
+  using permute_policy = RAJA::KernelPolicy< RAJA::statement::For<1, thread_exec,
+                                             RAJA::statement::For<0, RAJA::simd_exec,
+                                             RAJA::statement::Lambda<0> > > >;
+
+  RAJA::RangeSegment range(0, n);
+  auto grid = RAJA::make_tuple(range, range);
+
+  RAJA::kernel<regular_policy>(grid, [=](int i, int j) {
+      in(i,j)  = static_cast<double>(i+j);
+      out(i,j) = 0.0;
+  });
+
+  for (auto iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) stencil_time = prk::wtime();
+    // Apply the stencil operator
+    stencil(n, tile_size, in, out);
+    // Add constant to solution to force refresh of neighbor data, if any
+    RAJA::kernel<regular_policy>(grid, [=](int i, int j) {
+        in(i,j) += 1.0;
+    });
+  }
+
+  stencil_time = prk::wtime() - stencil_time;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  // interior of grid with respect to stencil
+  size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
+
+  // compute L1 norm in parallel
+#if 0
+  // This leads to incorrect computation of the norm.
+  RAJA::ReduceSum<RAJA::omp_reduce, double> reduced_norm(0.0);
+  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>
+#else
+  RAJA::ReduceSum<RAJA::seq_reduce, double> reduced_norm(0.0);
+  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>>
+#endif
+          ( RAJA::RangeSegment(radius,n-radius), RAJA::RangeSegment(radius,n-radius),
+            [&](RAJA::Index_type i, RAJA::Index_type j) {
+      reduced_norm += std::fabs(out(i,j));
+  });
+  double norm = reduced_norm / active_points;
+
+  // verify correctness
+  const double epsilon = 1.0e-8;
+  double reference_norm = 2.*(iterations+1.);
+  if (std::fabs(norm-reference_norm) > epsilon) {
+    std::cout << "ERROR: L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+    return 1;
+  } else {
+    std::cout << "Solution validates" << std::endl;
+#ifdef VERBOSE
+    std::cout << "L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+#endif
+    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+    size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
+    auto avgtime = stencil_time/iterations;
+    std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc
index cff3421f3..822a45c00 100644
--- a/Cxx11/stencil-vector-raja.cc
+++ b/Cxx11/stencil-vector-raja.cc
@@ -62,15 +62,6 @@
 
 #include "prk_util.h"
 #include "prk_raja.h"
-
-// This must be before the stencil header, which uses this.
-#ifdef RAJA_ENABLE_OPENMP
-  typedef RAJA::omp_parallel_for_exec thread_exec;
-#else
-#warning No OpenMP!
-  typedef RAJA::seq_exec thread_exec;
-#endif
-
 #include "stencil_raja.hpp"
 
 void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out)
diff --git a/Cxx11/stencil_kokkos.hpp b/Cxx11/stencil_kokkos.hpp
index cb5009aae..94fffab58 100644
--- a/Cxx11/stencil_kokkos.hpp
+++ b/Cxx11/stencil_kokkos.hpp
@@ -5,7 +5,7 @@ void star1(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+0,j+-1) * -0.5
                           +in(i+0,j+1) * 0.5
                           +in(i+1,j+0) * 0.5;
-     });
+    });
 }
 
 void star2(const int n, const int t, matrix & in, matrix & out) {
@@ -19,7 +19,7 @@ void star2(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+0,j+2) * 0.125
                           +in(i+1,j+0) * 0.25
                           +in(i+2,j+0) * 0.125;
-     });
+    });
 }
 
 void star3(const int n, const int t, matrix & in, matrix & out) {
@@ -37,7 +37,7 @@ void star3(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+1,j+0) * 0.166666666667
                           +in(i+2,j+0) * 0.0833333333333
                           +in(i+3,j+0) * 0.0555555555556;
-     });
+    });
 }
 
 void star4(const int n, const int t, matrix & in, matrix & out) {
@@ -59,7 +59,7 @@ void star4(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+2,j+0) * 0.0625
                           +in(i+3,j+0) * 0.0416666666667
                           +in(i+4,j+0) * 0.03125;
-     });
+    });
 }
 
 void star5(const int n, const int t, matrix & in, matrix & out) {
@@ -85,7 +85,7 @@ void star5(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+3,j+0) * 0.0333333333333
                           +in(i+4,j+0) * 0.025
                           +in(i+5,j+0) * 0.02;
-     });
+    });
 }
 
 void grid1(const int n, const int t, matrix & in, matrix & out) {
@@ -98,7 +98,7 @@ void grid1(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+1,j+0) * 0.25
                           +in(i+1,j+1) * 0.25
                           ;
-     });
+    });
 }
 
 void grid2(const int n, const int t, matrix & in, matrix & out) {
@@ -125,7 +125,7 @@ void grid2(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+2,j+1) * 0.0208333333333
                           +in(i+2,j+2) * 0.0625
                           ;
-     });
+    });
 }
 
 void grid3(const int n, const int t, matrix & in, matrix & out) {
@@ -174,7 +174,7 @@ void grid3(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+3,j+2) * 0.00555555555556
                           +in(i+3,j+3) * 0.0277777777778
                           ;
-     });
+    });
 }
 
 void grid4(const int n, const int t, matrix & in, matrix & out) {
@@ -253,7 +253,7 @@ void grid4(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+4,j+3) * 0.00223214285714
                           +in(i+4,j+4) * 0.015625
                           ;
-     });
+    });
 }
 
 void grid5(const int n, const int t, matrix & in, matrix & out) {
@@ -370,6 +370,6 @@ void grid5(const int n, const int t, matrix & in, matrix & out) {
                           +in(i+5,j+4) * 0.00111111111111
                           +in(i+5,j+5) * 0.01
                           ;
-     });
+    });
 }
 
diff --git a/Cxx11/stencil_pgnu.hpp b/Cxx11/stencil_pgnu.hpp
index d6c1ee3eb..0db4fedfc 100644
--- a/Cxx11/stencil_pgnu.hpp
+++ b/Cxx11/stencil_pgnu.hpp
@@ -6,8 +6,8 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+-1)] * -0.5
                           +in[(i+0)*n+(j+1)] * 0.5
                           +in[(i+1)*n+(j+0)] * 0.5;
-       });
-     });
+      });
+    });
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -22,8 +22,8 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+2)] * 0.125
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+2)*n+(j+0)] * 0.125;
-       });
-     });
+      });
+    });
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -42,8 +42,8 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.166666666667
                           +in[(i+2)*n+(j+0)] * 0.0833333333333
                           +in[(i+3)*n+(j+0)] * 0.0555555555556;
-       });
-     });
+      });
+    });
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -66,8 +66,8 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+0)] * 0.0625
                           +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
-       });
-     });
+      });
+    });
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -94,8 +94,8 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
-       });
-     });
+      });
+    });
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -109,8 +109,8 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -138,8 +138,8 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -189,8 +189,8 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -270,8 +270,8 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -389,7 +389,7 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
-       });
-     });
+      });
+    });
 }
 
diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp
index 03f24fcb5..e0557a6d3 100644
--- a/Cxx11/stencil_pstl.hpp
+++ b/Cxx11/stencil_pstl.hpp
@@ -6,8 +6,8 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+-1)] * -0.5
                           +in[(i+0)*n+(j+1)] * 0.5
                           +in[(i+1)*n+(j+0)] * 0.5;
-       });
-     });
+      });
+    });
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -22,8 +22,8 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+2)] * 0.125
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+2)*n+(j+0)] * 0.125;
-       });
-     });
+      });
+    });
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -42,8 +42,8 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.166666666667
                           +in[(i+2)*n+(j+0)] * 0.0833333333333
                           +in[(i+3)*n+(j+0)] * 0.0555555555556;
-       });
-     });
+      });
+    });
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -66,8 +66,8 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+0)] * 0.0625
                           +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
-       });
-     });
+      });
+    });
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -94,8 +94,8 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
-       });
-     });
+      });
+    });
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -109,8 +109,8 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -138,8 +138,8 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -189,8 +189,8 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -270,8 +270,8 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -389,7 +389,7 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
-       });
-     });
+      });
+    });
 }
 
diff --git a/Cxx11/stencil_raja.hpp b/Cxx11/stencil_raja.hpp
index f3065e85e..82e4e6d8f 100644
--- a/Cxx11/stencil_raja.hpp
+++ b/Cxx11/stencil_raja.hpp
@@ -5,8 +5,8 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+-1)] * -0.5
                           +in[(i+0)*n+(j+1)] * 0.5
                           +in[(i+1)*n+(j+0)] * 0.5;
-       });
-     });
+      });
+    });
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -20,8 +20,8 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+2)] * 0.125
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+2)*n+(j+0)] * 0.125;
-       });
-     });
+      });
+    });
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -39,8 +39,8 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.166666666667
                           +in[(i+2)*n+(j+0)] * 0.0833333333333
                           +in[(i+3)*n+(j+0)] * 0.0555555555556;
-       });
-     });
+      });
+    });
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -62,8 +62,8 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+0)] * 0.0625
                           +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
-       });
-     });
+      });
+    });
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -89,8 +89,8 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
-       });
-     });
+      });
+    });
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -103,8 +103,8 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -131,8 +131,8 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -181,8 +181,8 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -261,8 +261,8 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -379,7 +379,7 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
-       });
-     });
+      });
+    });
 }
 
diff --git a/Cxx11/stencil_rajaview.hpp b/Cxx11/stencil_rajaview.hpp
new file mode 100644
index 000000000..0e303773f
--- /dev/null
+++ b/Cxx11/stencil_rajaview.hpp
@@ -0,0 +1,385 @@
+using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,                                           RAJA::statement::For<1, RAJA::simd_exec,                                           RAJA::statement::Lambda<0> > > >;void star1(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(1,n-1);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-1,j+0) * -0.5
+                          +in(i+0,j+-1) * -0.5
+                          +in(i+0,j+1) * 0.5
+                          +in(i+1,j+0) * 0.5;
+    });
+}
+
+void star2(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(2,n-2);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-2,j+0) * -0.125
+                          +in(i+-1,j+0) * -0.25
+                          +in(i+0,j+-2) * -0.125
+                          +in(i+0,j+-1) * -0.25
+                          +in(i+0,j+1) * 0.25
+                          +in(i+0,j+2) * 0.125
+                          +in(i+1,j+0) * 0.25
+                          +in(i+2,j+0) * 0.125;
+    });
+}
+
+void star3(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(3,n-3);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-3,j+0) * -0.0555555555556
+                          +in(i+-2,j+0) * -0.0833333333333
+                          +in(i+-1,j+0) * -0.166666666667
+                          +in(i+0,j+-3) * -0.0555555555556
+                          +in(i+0,j+-2) * -0.0833333333333
+                          +in(i+0,j+-1) * -0.166666666667
+                          +in(i+0,j+1) * 0.166666666667
+                          +in(i+0,j+2) * 0.0833333333333
+                          +in(i+0,j+3) * 0.0555555555556
+                          +in(i+1,j+0) * 0.166666666667
+                          +in(i+2,j+0) * 0.0833333333333
+                          +in(i+3,j+0) * 0.0555555555556;
+    });
+}
+
+void star4(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(4,n-4);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-4,j+0) * -0.03125
+                          +in(i+-3,j+0) * -0.0416666666667
+                          +in(i+-2,j+0) * -0.0625
+                          +in(i+-1,j+0) * -0.125
+                          +in(i+0,j+-4) * -0.03125
+                          +in(i+0,j+-3) * -0.0416666666667
+                          +in(i+0,j+-2) * -0.0625
+                          +in(i+0,j+-1) * -0.125
+                          +in(i+0,j+1) * 0.125
+                          +in(i+0,j+2) * 0.0625
+                          +in(i+0,j+3) * 0.0416666666667
+                          +in(i+0,j+4) * 0.03125
+                          +in(i+1,j+0) * 0.125
+                          +in(i+2,j+0) * 0.0625
+                          +in(i+3,j+0) * 0.0416666666667
+                          +in(i+4,j+0) * 0.03125;
+    });
+}
+
+void star5(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(5,n-5);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-5,j+0) * -0.02
+                          +in(i+-4,j+0) * -0.025
+                          +in(i+-3,j+0) * -0.0333333333333
+                          +in(i+-2,j+0) * -0.05
+                          +in(i+-1,j+0) * -0.1
+                          +in(i+0,j+-5) * -0.02
+                          +in(i+0,j+-4) * -0.025
+                          +in(i+0,j+-3) * -0.0333333333333
+                          +in(i+0,j+-2) * -0.05
+                          +in(i+0,j+-1) * -0.1
+                          +in(i+0,j+1) * 0.1
+                          +in(i+0,j+2) * 0.05
+                          +in(i+0,j+3) * 0.0333333333333
+                          +in(i+0,j+4) * 0.025
+                          +in(i+0,j+5) * 0.02
+                          +in(i+1,j+0) * 0.1
+                          +in(i+2,j+0) * 0.05
+                          +in(i+3,j+0) * 0.0333333333333
+                          +in(i+4,j+0) * 0.025
+                          +in(i+5,j+0) * 0.02;
+    });
+}
+
+void grid1(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(1,n-1);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-1,j+-1) * -0.25
+                          +in(i+-1,j+0) * -0.25
+                          +in(i+0,j+-1) * -0.25
+                          +in(i+0,j+1) * 0.25
+                          +in(i+1,j+0) * 0.25
+                          +in(i+1,j+1) * 0.25
+                          ;
+    });
+}
+
+void grid2(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(2,n-2);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-2,j+-2) * -0.0625
+                          +in(i+-2,j+-1) * -0.0208333333333
+                          +in(i+-2,j+0) * -0.0208333333333
+                          +in(i+-2,j+1) * -0.0208333333333
+                          +in(i+-1,j+-2) * -0.0208333333333
+                          +in(i+-1,j+-1) * -0.125
+                          +in(i+-1,j+0) * -0.125
+                          +in(i+-1,j+2) * 0.0208333333333
+                          +in(i+0,j+-2) * -0.0208333333333
+                          +in(i+0,j+-1) * -0.125
+                          +in(i+0,j+1) * 0.125
+                          +in(i+0,j+2) * 0.0208333333333
+                          +in(i+1,j+-2) * -0.0208333333333
+                          +in(i+1,j+0) * 0.125
+                          +in(i+1,j+1) * 0.125
+                          +in(i+1,j+2) * 0.0208333333333
+                          +in(i+2,j+-1) * 0.0208333333333
+                          +in(i+2,j+0) * 0.0208333333333
+                          +in(i+2,j+1) * 0.0208333333333
+                          +in(i+2,j+2) * 0.0625
+                          ;
+    });
+}
+
+void grid3(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(3,n-3);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-3,j+-3) * -0.0277777777778
+                          +in(i+-3,j+-2) * -0.00555555555556
+                          +in(i+-3,j+-1) * -0.00555555555556
+                          +in(i+-3,j+0) * -0.00555555555556
+                          +in(i+-3,j+1) * -0.00555555555556
+                          +in(i+-3,j+2) * -0.00555555555556
+                          +in(i+-2,j+-3) * -0.00555555555556
+                          +in(i+-2,j+-2) * -0.0416666666667
+                          +in(i+-2,j+-1) * -0.0138888888889
+                          +in(i+-2,j+0) * -0.0138888888889
+                          +in(i+-2,j+1) * -0.0138888888889
+                          +in(i+-2,j+3) * 0.00555555555556
+                          +in(i+-1,j+-3) * -0.00555555555556
+                          +in(i+-1,j+-2) * -0.0138888888889
+                          +in(i+-1,j+-1) * -0.0833333333333
+                          +in(i+-1,j+0) * -0.0833333333333
+                          +in(i+-1,j+2) * 0.0138888888889
+                          +in(i+-1,j+3) * 0.00555555555556
+                          +in(i+0,j+-3) * -0.00555555555556
+                          +in(i+0,j+-2) * -0.0138888888889
+                          +in(i+0,j+-1) * -0.0833333333333
+                          +in(i+0,j+1) * 0.0833333333333
+                          +in(i+0,j+2) * 0.0138888888889
+                          +in(i+0,j+3) * 0.00555555555556
+                          +in(i+1,j+-3) * -0.00555555555556
+                          +in(i+1,j+-2) * -0.0138888888889
+                          +in(i+1,j+0) * 0.0833333333333
+                          +in(i+1,j+1) * 0.0833333333333
+                          +in(i+1,j+2) * 0.0138888888889
+                          +in(i+1,j+3) * 0.00555555555556
+                          +in(i+2,j+-3) * -0.00555555555556
+                          +in(i+2,j+-1) * 0.0138888888889
+                          +in(i+2,j+0) * 0.0138888888889
+                          +in(i+2,j+1) * 0.0138888888889
+                          +in(i+2,j+2) * 0.0416666666667
+                          +in(i+2,j+3) * 0.00555555555556
+                          +in(i+3,j+-2) * 0.00555555555556
+                          +in(i+3,j+-1) * 0.00555555555556
+                          +in(i+3,j+0) * 0.00555555555556
+                          +in(i+3,j+1) * 0.00555555555556
+                          +in(i+3,j+2) * 0.00555555555556
+                          +in(i+3,j+3) * 0.0277777777778
+                          ;
+    });
+}
+
+void grid4(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(4,n-4);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-4,j+-4) * -0.015625
+                          +in(i+-4,j+-3) * -0.00223214285714
+                          +in(i+-4,j+-2) * -0.00223214285714
+                          +in(i+-4,j+-1) * -0.00223214285714
+                          +in(i+-4,j+0) * -0.00223214285714
+                          +in(i+-4,j+1) * -0.00223214285714
+                          +in(i+-4,j+2) * -0.00223214285714
+                          +in(i+-4,j+3) * -0.00223214285714
+                          +in(i+-3,j+-4) * -0.00223214285714
+                          +in(i+-3,j+-3) * -0.0208333333333
+                          +in(i+-3,j+-2) * -0.00416666666667
+                          +in(i+-3,j+-1) * -0.00416666666667
+                          +in(i+-3,j+0) * -0.00416666666667
+                          +in(i+-3,j+1) * -0.00416666666667
+                          +in(i+-3,j+2) * -0.00416666666667
+                          +in(i+-3,j+4) * 0.00223214285714
+                          +in(i+-2,j+-4) * -0.00223214285714
+                          +in(i+-2,j+-3) * -0.00416666666667
+                          +in(i+-2,j+-2) * -0.03125
+                          +in(i+-2,j+-1) * -0.0104166666667
+                          +in(i+-2,j+0) * -0.0104166666667
+                          +in(i+-2,j+1) * -0.0104166666667
+                          +in(i+-2,j+3) * 0.00416666666667
+                          +in(i+-2,j+4) * 0.00223214285714
+                          +in(i+-1,j+-4) * -0.00223214285714
+                          +in(i+-1,j+-3) * -0.00416666666667
+                          +in(i+-1,j+-2) * -0.0104166666667
+                          +in(i+-1,j+-1) * -0.0625
+                          +in(i+-1,j+0) * -0.0625
+                          +in(i+-1,j+2) * 0.0104166666667
+                          +in(i+-1,j+3) * 0.00416666666667
+                          +in(i+-1,j+4) * 0.00223214285714
+                          +in(i+0,j+-4) * -0.00223214285714
+                          +in(i+0,j+-3) * -0.00416666666667
+                          +in(i+0,j+-2) * -0.0104166666667
+                          +in(i+0,j+-1) * -0.0625
+                          +in(i+0,j+1) * 0.0625
+                          +in(i+0,j+2) * 0.0104166666667
+                          +in(i+0,j+3) * 0.00416666666667
+                          +in(i+0,j+4) * 0.00223214285714
+                          +in(i+1,j+-4) * -0.00223214285714
+                          +in(i+1,j+-3) * -0.00416666666667
+                          +in(i+1,j+-2) * -0.0104166666667
+                          +in(i+1,j+0) * 0.0625
+                          +in(i+1,j+1) * 0.0625
+                          +in(i+1,j+2) * 0.0104166666667
+                          +in(i+1,j+3) * 0.00416666666667
+                          +in(i+1,j+4) * 0.00223214285714
+                          +in(i+2,j+-4) * -0.00223214285714
+                          +in(i+2,j+-3) * -0.00416666666667
+                          +in(i+2,j+-1) * 0.0104166666667
+                          +in(i+2,j+0) * 0.0104166666667
+                          +in(i+2,j+1) * 0.0104166666667
+                          +in(i+2,j+2) * 0.03125
+                          +in(i+2,j+3) * 0.00416666666667
+                          +in(i+2,j+4) * 0.00223214285714
+                          +in(i+3,j+-4) * -0.00223214285714
+                          +in(i+3,j+-2) * 0.00416666666667
+                          +in(i+3,j+-1) * 0.00416666666667
+                          +in(i+3,j+0) * 0.00416666666667
+                          +in(i+3,j+1) * 0.00416666666667
+                          +in(i+3,j+2) * 0.00416666666667
+                          +in(i+3,j+3) * 0.0208333333333
+                          +in(i+3,j+4) * 0.00223214285714
+                          +in(i+4,j+-3) * 0.00223214285714
+                          +in(i+4,j+-2) * 0.00223214285714
+                          +in(i+4,j+-1) * 0.00223214285714
+                          +in(i+4,j+0) * 0.00223214285714
+                          +in(i+4,j+1) * 0.00223214285714
+                          +in(i+4,j+2) * 0.00223214285714
+                          +in(i+4,j+3) * 0.00223214285714
+                          +in(i+4,j+4) * 0.015625
+                          ;
+    });
+}
+
+void grid5(const int n, const int t, matrix & in, matrix & out) {
+    RAJA::RangeSegment inner1(5,n-5);
+    auto inner2 = RAJA::make_tuple(inner1, inner1);
+    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
+              out(i,j) += +in(i+-5,j+-5) * -0.01
+                          +in(i+-5,j+-4) * -0.00111111111111
+                          +in(i+-5,j+-3) * -0.00111111111111
+                          +in(i+-5,j+-2) * -0.00111111111111
+                          +in(i+-5,j+-1) * -0.00111111111111
+                          +in(i+-5,j+0) * -0.00111111111111
+                          +in(i+-5,j+1) * -0.00111111111111
+                          +in(i+-5,j+2) * -0.00111111111111
+                          +in(i+-5,j+3) * -0.00111111111111
+                          +in(i+-5,j+4) * -0.00111111111111
+                          +in(i+-4,j+-5) * -0.00111111111111
+                          +in(i+-4,j+-4) * -0.0125
+                          +in(i+-4,j+-3) * -0.00178571428571
+                          +in(i+-4,j+-2) * -0.00178571428571
+                          +in(i+-4,j+-1) * -0.00178571428571
+                          +in(i+-4,j+0) * -0.00178571428571
+                          +in(i+-4,j+1) * -0.00178571428571
+                          +in(i+-4,j+2) * -0.00178571428571
+                          +in(i+-4,j+3) * -0.00178571428571
+                          +in(i+-4,j+5) * 0.00111111111111
+                          +in(i+-3,j+-5) * -0.00111111111111
+                          +in(i+-3,j+-4) * -0.00178571428571
+                          +in(i+-3,j+-3) * -0.0166666666667
+                          +in(i+-3,j+-2) * -0.00333333333333
+                          +in(i+-3,j+-1) * -0.00333333333333
+                          +in(i+-3,j+0) * -0.00333333333333
+                          +in(i+-3,j+1) * -0.00333333333333
+                          +in(i+-3,j+2) * -0.00333333333333
+                          +in(i+-3,j+4) * 0.00178571428571
+                          +in(i+-3,j+5) * 0.00111111111111
+                          +in(i+-2,j+-5) * -0.00111111111111
+                          +in(i+-2,j+-4) * -0.00178571428571
+                          +in(i+-2,j+-3) * -0.00333333333333
+                          +in(i+-2,j+-2) * -0.025
+                          +in(i+-2,j+-1) * -0.00833333333333
+                          +in(i+-2,j+0) * -0.00833333333333
+                          +in(i+-2,j+1) * -0.00833333333333
+                          +in(i+-2,j+3) * 0.00333333333333
+                          +in(i+-2,j+4) * 0.00178571428571
+                          +in(i+-2,j+5) * 0.00111111111111
+                          +in(i+-1,j+-5) * -0.00111111111111
+                          +in(i+-1,j+-4) * -0.00178571428571
+                          +in(i+-1,j+-3) * -0.00333333333333
+                          +in(i+-1,j+-2) * -0.00833333333333
+                          +in(i+-1,j+-1) * -0.05
+                          +in(i+-1,j+0) * -0.05
+                          +in(i+-1,j+2) * 0.00833333333333
+                          +in(i+-1,j+3) * 0.00333333333333
+                          +in(i+-1,j+4) * 0.00178571428571
+                          +in(i+-1,j+5) * 0.00111111111111
+                          +in(i+0,j+-5) * -0.00111111111111
+                          +in(i+0,j+-4) * -0.00178571428571
+                          +in(i+0,j+-3) * -0.00333333333333
+                          +in(i+0,j+-2) * -0.00833333333333
+                          +in(i+0,j+-1) * -0.05
+                          +in(i+0,j+1) * 0.05
+                          +in(i+0,j+2) * 0.00833333333333
+                          +in(i+0,j+3) * 0.00333333333333
+                          +in(i+0,j+4) * 0.00178571428571
+                          +in(i+0,j+5) * 0.00111111111111
+                          +in(i+1,j+-5) * -0.00111111111111
+                          +in(i+1,j+-4) * -0.00178571428571
+                          +in(i+1,j+-3) * -0.00333333333333
+                          +in(i+1,j+-2) * -0.00833333333333
+                          +in(i+1,j+0) * 0.05
+                          +in(i+1,j+1) * 0.05
+                          +in(i+1,j+2) * 0.00833333333333
+                          +in(i+1,j+3) * 0.00333333333333
+                          +in(i+1,j+4) * 0.00178571428571
+                          +in(i+1,j+5) * 0.00111111111111
+                          +in(i+2,j+-5) * -0.00111111111111
+                          +in(i+2,j+-4) * -0.00178571428571
+                          +in(i+2,j+-3) * -0.00333333333333
+                          +in(i+2,j+-1) * 0.00833333333333
+                          +in(i+2,j+0) * 0.00833333333333
+                          +in(i+2,j+1) * 0.00833333333333
+                          +in(i+2,j+2) * 0.025
+                          +in(i+2,j+3) * 0.00333333333333
+                          +in(i+2,j+4) * 0.00178571428571
+                          +in(i+2,j+5) * 0.00111111111111
+                          +in(i+3,j+-5) * -0.00111111111111
+                          +in(i+3,j+-4) * -0.00178571428571
+                          +in(i+3,j+-2) * 0.00333333333333
+                          +in(i+3,j+-1) * 0.00333333333333
+                          +in(i+3,j+0) * 0.00333333333333
+                          +in(i+3,j+1) * 0.00333333333333
+                          +in(i+3,j+2) * 0.00333333333333
+                          +in(i+3,j+3) * 0.0166666666667
+                          +in(i+3,j+4) * 0.00178571428571
+                          +in(i+3,j+5) * 0.00111111111111
+                          +in(i+4,j+-5) * -0.00111111111111
+                          +in(i+4,j+-3) * 0.00178571428571
+                          +in(i+4,j+-2) * 0.00178571428571
+                          +in(i+4,j+-1) * 0.00178571428571
+                          +in(i+4,j+0) * 0.00178571428571
+                          +in(i+4,j+1) * 0.00178571428571
+                          +in(i+4,j+2) * 0.00178571428571
+                          +in(i+4,j+3) * 0.00178571428571
+                          +in(i+4,j+4) * 0.0125
+                          +in(i+4,j+5) * 0.00111111111111
+                          +in(i+5,j+-4) * 0.00111111111111
+                          +in(i+5,j+-3) * 0.00111111111111
+                          +in(i+5,j+-2) * 0.00111111111111
+                          +in(i+5,j+-1) * 0.00111111111111
+                          +in(i+5,j+0) * 0.00111111111111
+                          +in(i+5,j+1) * 0.00111111111111
+                          +in(i+5,j+2) * 0.00111111111111
+                          +in(i+5,j+3) * 0.00111111111111
+                          +in(i+5,j+4) * 0.00111111111111
+                          +in(i+5,j+5) * 0.01
+                          ;
+    });
+}
+
diff --git a/Cxx11/stencil_stl.hpp b/Cxx11/stencil_stl.hpp
index 4dcdde467..6633cff00 100644
--- a/Cxx11/stencil_stl.hpp
+++ b/Cxx11/stencil_stl.hpp
@@ -6,8 +6,8 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+-1)] * -0.5
                           +in[(i+0)*n+(j+1)] * 0.5
                           +in[(i+1)*n+(j+0)] * 0.5;
-       });
-     });
+      });
+    });
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -22,8 +22,8 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+0)*n+(j+2)] * 0.125
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+2)*n+(j+0)] * 0.125;
-       });
-     });
+      });
+    });
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -42,8 +42,8 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.166666666667
                           +in[(i+2)*n+(j+0)] * 0.0833333333333
                           +in[(i+3)*n+(j+0)] * 0.0555555555556;
-       });
-     });
+      });
+    });
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -66,8 +66,8 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+0)] * 0.0625
                           +in[(i+3)*n+(j+0)] * 0.0416666666667
                           +in[(i+4)*n+(j+0)] * 0.03125;
-       });
-     });
+      });
+    });
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -94,8 +94,8 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+0)] * 0.0333333333333
                           +in[(i+4)*n+(j+0)] * 0.025
                           +in[(i+5)*n+(j+0)] * 0.02;
-       });
-     });
+      });
+    });
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -109,8 +109,8 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+1)*n+(j+0)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -138,8 +138,8 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -189,8 +189,8 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -270,8 +270,8 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
-       });
-     });
+      });
+    });
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
@@ -389,7 +389,7 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
-       });
-     });
+      });
+    });
 }
 
diff --git a/Cxx11/transpose-raja.cc b/Cxx11/transpose-raja.cc
new file mode 100644
index 000000000..d685bfeb8
--- /dev/null
+++ b/Cxx11/transpose-raja.cc
@@ -0,0 +1,186 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_raja.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/RAJA Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int tile_size;
+  bool permute = false;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [<tile_size> <permute=0/1>]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+      // a negative tile size means no tiling of the local transpose
+      if (tile_size <= 0) tile_size = order;
+
+      auto permute_input = (argc>4) ? std::atoi(argv[4]) : 0;
+      if (permute_input != 0 && permute_input != 1) {
+        throw "ERROR: permute must be 0 (no) or 1 (yes)";
+      }
+      permute = (permute_input == 1);
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Permute loops        = " << (permute ? "yes" : "no") << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double trans_time(0);
+
+  double * RESTRICT Amem = new double[order*order];
+  double * RESTRICT Bmem = new double[order*order];
+
+  matrix A(Amem, order, order);
+  matrix B(Bmem, order, order);
+
+  using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,
+                                             RAJA::statement::For<1, RAJA::simd_exec,
+                                             RAJA::statement::Lambda<0> > > >;
+  using permute_policy = RAJA::KernelPolicy< RAJA::statement::For<1, thread_exec,
+                                             RAJA::statement::For<0, RAJA::simd_exec,
+                                             RAJA::statement::Lambda<0> > > >;
+
+  RAJA::RangeSegment range(0, order);
+  auto range2d = RAJA::make_tuple(range, range);
+
+  RAJA::kernel<regular_policy>(range2d, [=](int i, int j) {
+      A(i,j) = static_cast<double>(i*order+j);
+      B(i,j) = 0.0;
+  });
+
+  for (int iter = 0; iter<=iterations; ++iter) {
+
+    if (iter==1) trans_time = prk::wtime();
+
+    if (permute) {
+        RAJA::kernel<permute_policy>(range2d, [=](int i, int j) {
+            B(i,j) += A(j,i);
+            A(j,i) += 1.0;
+        });
+    } else {
+        RAJA::kernel<regular_policy>(range2d, [=](int i, int j) {
+            B(i,j) += A(j,i);
+            A(j,i) += 1.0;
+        });
+    }
+  }
+  trans_time = prk::wtime() - trans_time;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  using reduce_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,
+                                            RAJA::statement::For<1, RAJA::seq_exec,
+                                            RAJA::statement::Lambda<0> > > >;
+
+  double const addit = (iterations+1.) * (0.5*iterations);
+  RAJA::ReduceSum<reduce_exec, double> abserr(0.0);
+  RAJA::kernel<reduce_policy>(range2d, [=](int i, int j) {
+      double const ij = static_cast<double>(i*order+j);
+      double const reference = ij*(1.+iterations)+addit;
+      abserr += std::fabs(B(j,i) - reference);
+  });
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  double epsilon(1.0e-8);
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc
index 59b757eea..d40cefc65 100644
--- a/Cxx11/transpose-vector-raja.cc
+++ b/Cxx11/transpose-vector-raja.cc
@@ -270,7 +270,7 @@ int main(int argc, char * argv[])
   std::cout << "RAJA use simd         = " << (use_simd ? "yes" : "no") << std::endl;
 
   //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
+  // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
   std::vector<double> A(order*order);
@@ -421,9 +421,9 @@ int main(int argc, char * argv[])
   }
 #endif
 
-  auto trans_time = 0.0;
+  double trans_time(0);
 
-  for (auto iter = 0; iter<=iterations; iter++) {
+  for (int iter = 0; iter<=iterations; iter++) {
 
     if (iter==1) trans_time = prk::wtime();
 
@@ -598,17 +598,16 @@ int main(int argc, char * argv[])
   }
 #endif
 
-
 #ifdef VERBOSE
   std::cout << "Sum of absolute differences: " << abserr << std::endl;
 #endif
 
-  const auto epsilon = 1.0e-8;
+  double epsilon(1.0e-8);
   if (abserr < epsilon) {
     std::cout << "Solution validates" << std::endl;
     auto avgtime = trans_time/iterations;
     auto bytes = (size_t)order * (size_t)order * sizeof(double);
-    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "ERROR: Aggregate squared error " << abserr
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 0b0827729..2820eeff8 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -592,11 +592,17 @@ case "$PRK_TARGET" in
                 ;;
         esac
         # RAJA
-        make -C $PRK_TARGET_PATH stencil-vector-raja transpose-vector-raja nstream-vector-raja
+        make -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \
+                                 p2p-raja stencil-raja transpose-raja nstream-raja
+        # New (Views)
+        $PRK_TARGET_PATH/p2p-raja                10 1024 1024
+        $PRK_TARGET_PATH/stencil-raja            10 1000
+        $PRK_TARGET_PATH/transpose-raja          10 1024
+        $PRK_TARGET_PATH/nstream-raja            10 16777216 32
+        # Old (STL)
+        $PRK_TARGET_PATH/p2p-vector-raja         10 1024 1024
         $PRK_TARGET_PATH/stencil-vector-raja     10 1000
-        # RAJA variant 11 should be the best
         $PRK_TARGET_PATH/transpose-vector-raja   10 1024
-        # test all the RAJA variants with a smaller problem
         for f in seq omp tbb ; do
          for s in y n ; do
           for t in y n ; do
@@ -612,6 +618,7 @@ case "$PRK_TARGET" in
         for s in star grid ; do
             for r in 1 2 3 4 5 ; do
                 $PRK_TARGET_PATH/stencil-vector-raja 10 200 20 $s $r
+                $PRK_TARGET_PATH/stencil-raja        10 200 20 $s $r
             done
         done
         # Kokkos
diff --git a/travis/install-raja.sh b/travis/install-raja.sh
index fe633f5aa..114b9f2a5 100644
--- a/travis/install-raja.sh
+++ b/travis/install-raja.sh
@@ -40,8 +40,7 @@ esac
 ${PRK_CXX} -v
 
 if [ ! -d "$TRAVIS_ROOT/raja" ]; then
-    #BRANCH=develop # forallN deprecated
-    BRANCH=master
+    BRANCH=develop
     git clone --recursive --depth 1 -b ${BRANCH} https://github.com/LLNL/RAJA.git
     cd RAJA
     mkdir build

From 65d27e74278d5ace0f68ef989f7151c50f10a88f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Mon, 4 Jun 2018 09:46:59 -0700
Subject: [PATCH 104/245] refactor Rust files to use Cargo (#351)

* refactor Rust files to use Cargo

* use cargo in travis

* fix path in Travis for Rust

* try again with dir
---
 .gitignore                       |   6 +-
 RUST/Makefile                    |  28 +---
 RUST/legacy/Makefile             |  34 ++++
 RUST/{ => legacy}/p2p.rs         |   0
 RUST/{ => legacy}/stencil-old.rs |   0
 RUST/{ => legacy}/stencil.rs     |   0
 RUST/{ => legacy}/transpose.rs   |   0
 RUST/p2p/Cargo.toml              |   6 +
 RUST/p2p/src/main.rs             | 175 ++++++++++++++++++++
 RUST/stencil/Cargo.toml          |   6 +
 RUST/stencil/src/main.rs         | 273 +++++++++++++++++++++++++++++++
 RUST/transpose/Cargo.toml        |   6 +
 RUST/transpose/src/main.rs       | 190 +++++++++++++++++++++
 travis/build-run-prk.sh          |   7 +-
 14 files changed, 704 insertions(+), 27 deletions(-)
 create mode 100644 RUST/legacy/Makefile
 rename RUST/{ => legacy}/p2p.rs (100%)
 rename RUST/{ => legacy}/stencil-old.rs (100%)
 rename RUST/{ => legacy}/stencil.rs (100%)
 rename RUST/{ => legacy}/transpose.rs (100%)
 create mode 100644 RUST/p2p/Cargo.toml
 create mode 100644 RUST/p2p/src/main.rs
 create mode 100644 RUST/stencil/Cargo.toml
 create mode 100644 RUST/stencil/src/main.rs
 create mode 100644 RUST/transpose/Cargo.toml
 create mode 100644 RUST/transpose/src/main.rs

diff --git a/.gitignore b/.gitignore
index 099e56f2a..b6b88b3b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -251,6 +251,6 @@ FORTRAN/transpose-ornlacc
 FORTRAN/transpose-taskloop-openmp
 FORTRAN/transpose-tasks-openmp
 FORTRAN/transpose-ornlacc
-RUST/p2p
-RUST/stencil
-RUST/transpose
+RUST/p2p/Cargo.lock
+RUST/stencil/Cargo.lock
+RUST/transpose/Cargo.lock
diff --git a/RUST/Makefile b/RUST/Makefile
index 74414a78d..4bbfeb5de 100644
--- a/RUST/Makefile
+++ b/RUST/Makefile
@@ -1,13 +1,3 @@
-include ../common/RUST.defs
-include ../common/PRKVERSION
-
-ifndef RADIUS
-  RADIUS=2
-endif
-
-RUSTC    = rustc
-RCFLAGS  = -g
-
 # Enable verbose printing
 #RCFLAGS += --cfg "VERBOSE"
 
@@ -18,17 +8,15 @@ RCFLAGS  = -g
 # Stencil shape: star is default, uncomment to switch to grid
 #RCFLAGS += --cfg grid
 
-.PHONY: all clean run
-
-all: p2p stencil transpose
+.PHONY: all clean
 
-%: %.rs
-	$(RUSTC) $(RCFLAGS) $< -o $@
+all:
+	cd p2p && cargo build
+	cd stencil && cargo build
+	cd transpose && cargo build
 
 clean:
-	-rm -f *.o
-	-rm -f *.optrpt
-	-rm -f *.dwarf
-	-rm -rf *.dSYM
-	-rm -f p2p stencil transpose
+	cd p2p && cargo clean
+	cd stencil && cargo clean
+	cd transpose && cargo clean
 
diff --git a/RUST/legacy/Makefile b/RUST/legacy/Makefile
new file mode 100644
index 000000000..74414a78d
--- /dev/null
+++ b/RUST/legacy/Makefile
@@ -0,0 +1,34 @@
+include ../common/RUST.defs
+include ../common/PRKVERSION
+
+ifndef RADIUS
+  RADIUS=2
+endif
+
+RUSTC    = rustc
+RCFLAGS  = -g
+
+# Enable verbose printing
+#RCFLAGS += --cfg "VERBOSE"
+
+# This is now a runtime option
+# Stencil radius
+#RCFLAGS += --cfg radius="$(RADIUS)"
+
+# Stencil shape: star is default, uncomment to switch to grid
+#RCFLAGS += --cfg grid
+
+.PHONY: all clean run
+
+all: p2p stencil transpose
+
+%: %.rs
+	$(RUSTC) $(RCFLAGS) $< -o $@
+
+clean:
+	-rm -f *.o
+	-rm -f *.optrpt
+	-rm -f *.dwarf
+	-rm -rf *.dSYM
+	-rm -f p2p stencil transpose
+
diff --git a/RUST/p2p.rs b/RUST/legacy/p2p.rs
similarity index 100%
rename from RUST/p2p.rs
rename to RUST/legacy/p2p.rs
diff --git a/RUST/stencil-old.rs b/RUST/legacy/stencil-old.rs
similarity index 100%
rename from RUST/stencil-old.rs
rename to RUST/legacy/stencil-old.rs
diff --git a/RUST/stencil.rs b/RUST/legacy/stencil.rs
similarity index 100%
rename from RUST/stencil.rs
rename to RUST/legacy/stencil.rs
diff --git a/RUST/transpose.rs b/RUST/legacy/transpose.rs
similarity index 100%
rename from RUST/transpose.rs
rename to RUST/legacy/transpose.rs
diff --git a/RUST/p2p/Cargo.toml b/RUST/p2p/Cargo.toml
new file mode 100644
index 000000000..589644538
--- /dev/null
+++ b/RUST/p2p/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "p2p"
+version = "0.1.0"
+authors = ["Jeff Hammond <jeff.r.hammond@intel.com>"]
+
+[dependencies]
diff --git a/RUST/p2p/src/main.rs b/RUST/p2p/src/main.rs
new file mode 100644
index 000000000..4da63472a
--- /dev/null
+++ b/RUST/p2p/src/main.rs
@@ -0,0 +1,175 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an m*n grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <m> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - C99-ification by Jeff Hammond, February 2016.
+///          - C++11-ification by Jeff Hammond, May 2017.
+///          - Rust port by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+use std::env;
+use std::time::{Instant,Duration};
+
+fn help() {
+  println!("Usage: <# iterations> <matrix order> [tile size]");
+}
+
+fn main()
+{
+  println!("Parallel Research Kernels version");
+  println!("Rust pipeline execution on 2D grid");
+
+  //////////////////////////////////////////////////////////////////////
+  // Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  let args : Vec<String> = env::args().collect();
+
+  let iterations : u32;
+  let m : usize;
+  let n : usize;
+
+  if args.len() == 4 {
+    iterations = match args[1].parse() {
+      Ok(n) => { n },
+      Err(_) => { help(); return; },
+    };
+    m = match args[2].parse() {
+      Ok(n) => { n },
+      Err(_) => { help(); return; },
+    };
+    n = match args[3].parse() {
+      Ok(n) => { n },
+      Err(_) => { help(); return; },
+    };
+  } else {
+    help();
+    return;
+  }
+
+  if iterations < 1 {
+    println!("ERROR: iterations must be >= 1");
+  }
+  if m < 1 || n < 1 {
+    println!("ERROR: grid dimensions must be positive: {}, {}", m, n);
+  }
+
+  println!("Grid sizes                = {}, {}", m, n);
+  println!("Number of iterations      = {}", iterations);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for the input and do the work
+  //////////////////////////////////////////////////////////////////////
+
+  let nelems : usize = m*n;
+  let mut vector : Vec<f64> = vec![0.0; nelems];
+
+  // set boundary values (bottom and left side of grid)
+  for j in 0..n {
+    vector[0*n+j] = j as f64;
+  }
+  for i in 0..m {
+    vector[i*n+0] = i as f64;
+  }
+
+  let timer = Instant::now();
+  let mut t0 : Duration = timer.elapsed();
+
+  for k in 0..iterations+1 {
+
+    if k == 1 { t0 = timer.elapsed(); }
+
+    for i in 1..m {
+      for j in 1..n {
+        vector[i*n+j] = vector[(i-1)*n+j] + vector[i*n+(j-1)] - vector[(i-1)*n+(j-1)];
+      }
+    }
+
+    // copy top right corner value to bottom left corner to create dependency; we
+    // need a barrier to make sure the latest value is used. This also guarantees
+    // that the flags for the next iteration (if any) are not getting clobbered
+    vector[0*n+0] = -vector[(m-1)*n+(n-1)];
+
+  }
+  let t1 = timer.elapsed();
+  let dt = (t1.checked_sub(t0)).unwrap();
+  let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64;
+  let pipeline_time : f64 = dtt as f64 / 1.0e9_f64 as f64;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  // error tolerance
+  let epsilon : f64 = 1.0e-8;
+
+  // verify correctness, using top right value
+  let corner_val : f64 = (((iterations+1) as usize)*(n + m as usize - 2 as usize)) as f64;
+  if ( (vector[(m-1)*n+(n-1)] - corner_val).abs() / corner_val) > epsilon {
+    println!("ERROR: checksum {} does not match verification value {} ", vector[(m-1)*n+(n-1)], corner_val);
+    return;
+  }
+
+  if cfg!(VERBOSE) {
+    println!("Solution validates; verification value = {}", corner_val);
+  } else {
+    println!("Solution validates");
+  }
+
+  let avgtime : f64 = (pipeline_time as f64) / (iterations as f64);
+  let bytes : usize = 2 * (m-1) * (n-1);
+  println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (bytes as f64) / avgtime, avgtime);
+}
diff --git a/RUST/stencil/Cargo.toml b/RUST/stencil/Cargo.toml
new file mode 100644
index 000000000..6a05b19c6
--- /dev/null
+++ b/RUST/stencil/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "stencil"
+version = "0.1.0"
+authors = ["Jeff Hammond <jeff.r.hammond@intel.com>"]
+
+[dependencies]
diff --git a/RUST/stencil/src/main.rs b/RUST/stencil/src/main.rs
new file mode 100644
index 000000000..a2bcb9c21
--- /dev/null
+++ b/RUST/stencil/src/main.rs
@@ -0,0 +1,273 @@
+
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "a" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///          - C++11-ification by Jeff Hammond, May 2017.
+///          - Rust port by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+use std::env;
+use std::time::{Instant,Duration};
+
+fn help() {
+  println!("Usage: <# iterations> <grid dimension> <radius>");
+}
+
+fn main()
+{
+  println!("Parallel Research Kernels");
+  println!("Rust stencil execution on 2D grid");
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  let args : Vec<String> = env::args().collect();
+
+  let iterations : usize;
+  let n : usize;
+  let r : usize;
+
+  // This is a compile-time setting.
+  // grid stencil (star is the default)
+  let grid : bool = if cfg!(grid) { true } else { false };
+
+  // I have failed to make this a compile-time setting.
+  /*
+  let r : usize =
+      if cfg!(radius = "1") { 1 } else
+      if cfg!(radius = "2") { 2 } else
+      if cfg!(radius = "3") { 3 } else
+      if cfg!(radius = "4") { 4 } else
+      if cfg!(radius = "5") { 5 } else
+      if cfg!(radius = "6") { 6 } else
+      { println!("FAIL"); 0 };
+  */
+
+  if args.len() == 4 {
+    iterations = match args[1].parse() {
+      Ok(n) => { n },
+      Err(_) => { help(); return; },
+    };
+    n = match args[2].parse() {
+      Ok(n) => { n },
+      Err(_) => { help(); return; },
+    };
+    r = match args[3].parse() {
+      Ok(n) => { n },
+      Err(_) => { 2 },
+    };
+  } else {
+    help();
+    return;
+  }
+
+  if iterations < 1 {
+    println!("ERROR: iterations must be >= 1");
+  }
+  if n < 1 {
+    println!("ERROR: grid dimension must be positive: {}", n);
+  }
+
+  if r < 1 {
+    println!("ERROR: Stencil radius {} should be positive ", r);
+    return;
+  } else if (2 * r + 1) > n {
+    println!("ERROR: Stencil radius {} exceeds grid size {}", r, n);
+    return;
+  }
+
+  println!("Grid size            = {}", n);
+  println!("Radius of stencil    = {}", r);
+  if grid {
+    println!("Type of stencil      = grid");
+  } else {
+    println!("Type of stencil      = star");
+  }
+  println!("Data type            = double precision");
+  println!("Compact representation of stencil loop body");
+  println!("Number of iterations = {}",iterations);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for the input and do the work
+  //////////////////////////////////////////////////////////////////////
+
+  // input and output arrays
+  let mut a : Vec<Vec<f64>> = vec![vec![0.0; n]; n];
+  let mut b : Vec<Vec<f64>> = vec![vec![0.0; n]; n];
+
+  // weights of points a the stencil
+  let wdim : usize = 2 * r + 1;
+  let mut w : Vec<Vec<f64>> = vec![vec![0.0; wdim]; wdim];
+
+  // fill the stencil ws to reflect a discrete divergence operator
+  let stencil_size : usize;
+  if grid {
+    stencil_size = (2*r+1)*(2*r+1);
+    for j in 1..r+1 {
+      for i in 1-j..j {
+        let denom : f64 = (4*j*(2*j-1)*r) as f64;
+        w[r+i][r+j] =  1./denom;
+        w[r+i][r-j] = -1./denom;
+        w[r+j][r+i] =  1./denom;
+        w[r-j][r+i] = -1./denom;
+      }
+      let denom : f64 = (4*j*r) as f64;
+      w[r+j][r+j]   =  1./denom;
+      w[r-j][r-j]   = -1./denom;
+    }
+  }  else /* star */ {
+    stencil_size = 4*r+1;
+    for i in 1..r+1 {
+      let denom : f64 = (2 * i * r) as f64;
+      w[r][r+i] =  1./denom;
+      w[r][r-i] = -1./denom;
+      w[r+i][r] =  1./denom;
+      w[r-i][r] = -1./denom;
+    }
+  }
+
+  // interior of grid with respect to stencil
+  let active_points : usize = (n-2*r)*(n-2*r);
+
+  // initialize the input and output arrays
+  for j in 0..n {
+    for i in 0..n {
+      a[i][j] = (i+j) as f64;
+      b[i][j] = 0.0;
+    }
+  }
+
+  let timer = Instant::now();
+  let mut t0 : Duration = timer.elapsed();
+
+  for k in 0..iterations+1 {
+
+    if k == 1 { t0 = timer.elapsed(); }
+
+    // Apply the stencil operator
+    for i in r..n-r {
+      for j in r..n-r {
+        if grid {
+          for ii in 0-r..r+1 {
+            for jj in 0-r..r+1 {
+              b[i][j] += w[r+ii][r+jj]*a[i+ii][j+jj];
+            }
+          }
+        } else {
+          b[i][j] += w[r][r]*a[i][j];
+          for jj in r..0 {
+            b[i][j] += w[r][r-jj]*a[i][j-jj];
+          }
+          for jj in 1..r+1 {
+            b[i][j] += w[r][r+jj]*a[i][j+jj];
+          }
+          for ii in r..0 {
+            b[i][j] += w[r-ii][r]*a[i-ii][j];
+          }
+          for ii in 1..r+1 {
+            b[i][j] += w[r+ii][r]*a[i+ii][j];
+          }
+        }
+      }
+    }
+
+    // add constant to solution to force refresh of neighbor data, if any
+    for j in 0..n {
+      for i in 0..n {
+        a[i][j] += 1.0;
+      }
+    }
+  }
+  let t1 = timer.elapsed();
+  let dt = (t1.checked_sub(t0)).unwrap();
+  let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64;
+  let stencil_time : f64 = dtt as f64 / 1.0e9_f64 as f64;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  // error tolerance
+  let epsilon : f64 = 1.0e-8;
+
+  // compute L1 norm a parallel
+  let mut norm : f64 = 0.0;
+  for i in r..n-r+1 {
+    for j in r..n-r+1 {
+      norm += (b[i][j]).abs();
+    }
+  }
+  norm /= active_points as f64;
+
+  // verify correctness
+  let reference_norm : f64 = 2.*(iterations as f64 + 1.);
+  if (norm-reference_norm).abs() > epsilon {
+    println!("ERROR: L1 norm = {} Reference L1 norm = {}", norm, reference_norm);
+    return;
+  } else {
+    println!("Solution validates");
+    if cfg!(VERBOSE) {
+      println!("L1 norm = {} Reference L1 norm = {}", norm, reference_norm);
+    }
+    let flops : usize = (2*stencil_size+1) * active_points;
+    let avgtime : f64 = (stencil_time as f64) / (iterations as f64);
+    println!("Rate (MFlops/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (flops as f64) / avgtime, avgtime);
+  }
+
+}
diff --git a/RUST/transpose/Cargo.toml b/RUST/transpose/Cargo.toml
new file mode 100644
index 000000000..3f634d3c5
--- /dev/null
+++ b/RUST/transpose/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "transpose"
+version = "0.1.0"
+authors = ["Jeff Hammond <jeff.r.hammond@intel.com>"]
+
+[dependencies]
diff --git a/RUST/transpose/src/main.rs b/RUST/transpose/src/main.rs
new file mode 100644
index 000000000..5d1ba1e87
--- /dev/null
+++ b/RUST/transpose/src/main.rs
@@ -0,0 +1,190 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+use std::env;
+use std::mem;
+use std::time::{Instant,Duration};
+
+fn help() {
+  println!("Usage: <# iterations> <matrix order> [tile size]");
+}
+
+fn main()
+{
+  println!("Parallel Research Kernels");
+  println!("Rust Matrix transpose: B = A^T");
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  let args : Vec<String> = env::args().collect();
+
+  let iterations : u32;
+  let order      : usize;
+  let tilesize   : usize;
+
+  match args.len() {
+    3 => {
+      iterations = match args[1].parse() {
+        Ok(n) => { n },
+        Err(_) => { help(); return; },
+      };
+      order = match args[2].parse() {
+        Ok(n) => { n },
+        Err(_) => { help(); return; },
+      };
+      tilesize = 32;
+    },
+    4 => {
+      iterations = match args[1].parse() {
+        Ok(n) => { n },
+        Err(_) => { help(); return; },
+      };
+      order = match args[2].parse() {
+        Ok(n) => { n },
+        Err(_) => { help(); return; },
+      };
+      tilesize = match args[3].parse() {
+        Ok(n) => { n },
+        Err(_) => { help(); return; },
+      };
+    },
+    _ => {
+      help();
+      return;
+    }
+  }
+
+  if iterations < 1 {
+    println!("ERROR: iterations must be >= 1");
+  }
+  if tilesize > order {
+    println!("ERROR: tilesize cannot be > order");
+  }
+
+  println!("Matrix order          = {}", order);
+  if tilesize < order {
+      println!("Tile size             = {}", tilesize);
+  } else {
+      println!("Untiled");
+  }
+  println!("Number of iterations  = {}", iterations);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  let nelems : usize = order*order;
+  let mut a : Vec<f64> = vec![0.0; nelems];
+  let mut b : Vec<f64> = vec![0.0; nelems];
+
+  for i in 0..order {
+    for j in 0..order {
+      a[i*order+j] = (i*order+j) as f64;
+    }
+  }
+
+  let timer = Instant::now();
+  let mut t0 : Duration = timer.elapsed();
+
+  for k in 0..iterations+1 {
+
+    if k == 1 { t0 = timer.elapsed(); }
+
+    for i in 0..order {
+      for j in 0..order {
+        b[j*order+i] += a[i*order+j];
+        a[i*order+j] += 1.0;
+      }
+    }
+
+  }
+  let t1 = timer.elapsed();
+  let dt = (t1.checked_sub(t0)).unwrap();
+  let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64;
+  let transpose_time : f64 = dtt as f64 * 1.0e-9;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2;
+  let mut abserr : f64 = 0.0;
+  for i in 0..order {
+    for j in 0..order {
+      let ij = i*order+j;
+      let ji = j*order+i;
+      let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64;
+      abserr += (b[ji] - reference).abs();
+    }
+  }
+
+  if cfg!(VERBOSE) {
+    println!("Sum of absolute differences: {:30.15}", abserr);
+  }
+
+  let epsilon : f64 = 1.0e-8;
+  if abserr < epsilon {
+    println!("Solution validates");
+    let avgtime : f64 = (transpose_time as f64) / (iterations as f64);
+    let bytes : usize = 2 * nelems * mem::size_of::<f64>();
+    println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e0-6_f64) * (bytes as f64) / avgtime, avgtime);
+  } else {
+    println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", abserr, epsilon);
+    return;
+  }
+}
+
+
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 2820eeff8..4a8ea2230 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -100,11 +100,10 @@ case "$PRK_TARGET" in
         echo "Rust"
         which rustc
         rustc --version
-        make $PRK_TARGET
         export PRK_TARGET_PATH=RUST
-        ./$PRK_TARGET_PATH/p2p               10 100 100
-        ./$PRK_TARGET_PATH/stencil           10 100
-        ./$PRK_TARGET_PATH/transpose         10 100
+        cd $TRAVIS_HOME/$PRK_TARGET_PATH/p2p       && cargo run 10 100 100
+        cd $TRAVIS_HOME/$PRK_TARGET_PATH/stencil   && cargo run 10 100
+        cd $TRAVIS_HOME/$PRK_TARGET_PATH/transpose && cargo run 10 100
         ;;
     allc1z)
         echo "C1z"

From 159156ecd145348c8e517eeaa162f05ab8d06e50 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 4 Jun 2018 23:49:11 -0500
Subject: [PATCH 105/245] fix whitespace in RAJA view stencil code gen

---
 Cxx11/generate-cxx-stencil.py | 6 +++---
 Cxx11/stencil_rajaview.hpp    | 6 +++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index e2ec18d37..286d0dfb3 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -160,9 +160,9 @@ def main():
       if (model=='target'):
           src.write('#define RESTRICT __restrict__\n\n')
       if (model=='rajaview'):
-          src.write('using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,')
-          src.write('                                           RAJA::statement::For<1, RAJA::simd_exec,')
-          src.write('                                           RAJA::statement::Lambda<0> > > >;')
+          src.write('using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,\n')
+          src.write('                                           RAJA::statement::For<1, RAJA::simd_exec,\n')
+          src.write('                                           RAJA::statement::Lambda<0> > > >;\n\n')
       #  src.write('OMP( declare target )\n\n')
       for pattern in ['star','grid']:
         for r in range(1,6):
diff --git a/Cxx11/stencil_rajaview.hpp b/Cxx11/stencil_rajaview.hpp
index 0e303773f..4a521770f 100644
--- a/Cxx11/stencil_rajaview.hpp
+++ b/Cxx11/stencil_rajaview.hpp
@@ -1,4 +1,8 @@
-using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,                                           RAJA::statement::For<1, RAJA::simd_exec,                                           RAJA::statement::Lambda<0> > > >;void star1(const int n, const int t, matrix & in, matrix & out) {
+using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,
+                                           RAJA::statement::For<1, RAJA::simd_exec,
+                                           RAJA::statement::Lambda<0> > > >;
+
+void star1(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(1,n-1);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {

From 07a8b413e2cab45a1fd129bc4166750c6c1d1ddd Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 5 Jun 2018 00:09:54 -0500
Subject: [PATCH 106/245] fix mistype bug in preprocess logic for Boost.Ranges

---
 Cxx11/prk_ranges.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h
index d794016ff..9eb081844 100644
--- a/Cxx11/prk_ranges.h
+++ b/Cxx11/prk_ranges.h
@@ -33,7 +33,7 @@
 #define PRK_RANGES_H
 
 #if defined(USE_RANGES)
-# if defined(USE_RANGES_IRANGE)
+# if defined(USE_BOOST_IRANGE)
 #  include "boost/range/irange.hpp"
 # elif defined(USE_RANGES_TS)
 #  include "range/v3/view/iota.hpp"

From 56cec34a5825a11136983a472ff7b263c48dfe74 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 5 Jun 2018 14:53:13 -0500
Subject: [PATCH 107/245] fix Travis XFAIL MPI1, Python, Charm++, AMPI (#360)

* fix copy+paste error
* conditional import of timer for Python 2.7
* try Charm++ autoprovision
* add numpy version and use dot instead of matmul
---
 .travis.yml               |  3 +--
 PYTHON/dgemm-numpy.py     | 11 ++++++++---
 PYTHON/dgemm.py           |  7 +++++--
 PYTHON/nstream-numpy.py   |  8 ++++++--
 PYTHON/nstream.py         |  7 +++++--
 PYTHON/p2p-numba.py       |  7 ++++++-
 PYTHON/p2p-numpy.py       |  8 ++++++--
 PYTHON/p2p.py             |  7 +++++--
 PYTHON/sparse-numpy.py    |  8 ++++++--
 PYTHON/sparse-scipy.py    |  7 +++++--
 PYTHON/sparse.py          |  7 +++++--
 PYTHON/stencil-numba.py   |  8 ++++++--
 PYTHON/stencil-numpy.py   |  8 ++++++--
 PYTHON/stencil.py         |  7 +++++--
 PYTHON/transpose-numpy.py |  8 ++++++--
 PYTHON/transpose.py       |  7 +++++--
 travis/build-run-prk.sh   | 14 +++++++++++---
 17 files changed, 97 insertions(+), 35 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b79ed39d5..ac0a9f07c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -145,11 +145,10 @@ matrix:
   - os: linux
     env: PRK_TARGET=allfgmpi
   - os: linux
+    compiler: clang
     env: PRK_TARGET=allmpi
   - os: linux
     env: PRK_TARGET=allcharm++
-  # Sadly, Python is XFAIL because Travis CI's Python 3.4.3 can't find cannot process_time.
-  - env: PRK_TARGET=allpython
 addons:
   apt:
     sources:
diff --git a/PYTHON/dgemm-numpy.py b/PYTHON/dgemm-numpy.py
index da5bcd197..db6bd9f92 100755
--- a/PYTHON/dgemm-numpy.py
+++ b/PYTHON/dgemm-numpy.py
@@ -51,9 +51,13 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 def main():
 
@@ -91,7 +95,8 @@ def main():
 
         if k<1: t0 = timer()
 
-        C += numpy.matmul(A,B)
+        #C += numpy.matmul(A,B) # requires Numpy 1.10 or later
+        C += numpy.dot(A,B)
 
     t1 = timer()
     dgemm_time = t1 - t0
diff --git a/PYTHON/dgemm.py b/PYTHON/dgemm.py
index 0aff9405f..9347d494a 100755
--- a/PYTHON/dgemm.py
+++ b/PYTHON/dgemm.py
@@ -51,8 +51,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 
 def main():
 
diff --git a/PYTHON/nstream-numpy.py b/PYTHON/nstream-numpy.py
index 593d4fae7..fd0808993 100755
--- a/PYTHON/nstream-numpy.py
+++ b/PYTHON/nstream-numpy.py
@@ -64,9 +64,13 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 def main():
 
diff --git a/PYTHON/nstream.py b/PYTHON/nstream.py
index b79d979c9..070ec4647 100755
--- a/PYTHON/nstream.py
+++ b/PYTHON/nstream.py
@@ -64,8 +64,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 
 def main():
 
diff --git a/PYTHON/p2p-numba.py b/PYTHON/p2p-numba.py
index 8d2922c87..c54fe742c 100755
--- a/PYTHON/p2p-numba.py
+++ b/PYTHON/p2p-numba.py
@@ -52,8 +52,13 @@
 # *******************************************************************
 
 import sys
-from timeit import default_timer as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 import numba
 
 @jit
diff --git a/PYTHON/p2p-numpy.py b/PYTHON/p2p-numpy.py
index 61818c492..9e16d65eb 100755
--- a/PYTHON/p2p-numpy.py
+++ b/PYTHON/p2p-numpy.py
@@ -52,9 +52,13 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 def main():
 
diff --git a/PYTHON/p2p.py b/PYTHON/p2p.py
index e5d605e56..6e724f4f9 100755
--- a/PYTHON/p2p.py
+++ b/PYTHON/p2p.py
@@ -52,8 +52,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 
 def main():
 
diff --git a/PYTHON/sparse-numpy.py b/PYTHON/sparse-numpy.py
index 261afaa64..b62cdfe52 100755
--- a/PYTHON/sparse-numpy.py
+++ b/PYTHON/sparse-numpy.py
@@ -54,9 +54,13 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 def offset(i,j,lsize):
     return i+(j<<lsize)
diff --git a/PYTHON/sparse-scipy.py b/PYTHON/sparse-scipy.py
index 2022442a6..3fa04ba37 100755
--- a/PYTHON/sparse-scipy.py
+++ b/PYTHON/sparse-scipy.py
@@ -54,8 +54,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
 import scipy
 
diff --git a/PYTHON/sparse.py b/PYTHON/sparse.py
index 511f13462..a36e5dc6b 100755
--- a/PYTHON/sparse.py
+++ b/PYTHON/sparse.py
@@ -54,8 +54,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 
 def offset(i,j,lsize):
     return i+(j<<lsize)
diff --git a/PYTHON/stencil-numba.py b/PYTHON/stencil-numba.py
index 1cc9c8b5d..03da35a48 100755
--- a/PYTHON/stencil-numba.py
+++ b/PYTHON/stencil-numba.py
@@ -56,10 +56,14 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 from numba import jit
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 @jit
 def grid(n,r,W,A,B):
diff --git a/PYTHON/stencil-numpy.py b/PYTHON/stencil-numpy.py
index b152e1c81..29ac31f29 100755
--- a/PYTHON/stencil-numpy.py
+++ b/PYTHON/stencil-numpy.py
@@ -56,9 +56,13 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 def main():
 
diff --git a/PYTHON/stencil.py b/PYTHON/stencil.py
index 9c8b066d0..618425435 100755
--- a/PYTHON/stencil.py
+++ b/PYTHON/stencil.py
@@ -56,8 +56,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 
 def main():
 
diff --git a/PYTHON/transpose-numpy.py b/PYTHON/transpose-numpy.py
index 58ee58197..a70c4e741 100755
--- a/PYTHON/transpose-numpy.py
+++ b/PYTHON/transpose-numpy.py
@@ -50,9 +50,13 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 import numpy
+print('Numpy version  = ', numpy.version.version)
 
 def main():
 
diff --git a/PYTHON/transpose.py b/PYTHON/transpose.py
index a67d365b2..43338aaf9 100755
--- a/PYTHON/transpose.py
+++ b/PYTHON/transpose.py
@@ -50,8 +50,11 @@
 # *******************************************************************
 
 import sys
-#from timeit import default_timer as timer
-from time import process_time as timer
+print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor))
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
 
 def main():
 
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 4a8ea2230..cf197e44e 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -916,7 +916,7 @@ case "$PRK_TARGET" in
         make allmpishm
         export PRK_TARGET_PATH=MPISHM
         export PRK_MPI_PROCS=4
-        export PRK_RUN="$PRK_RUN -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}"
+        export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}"
         export PRK_MPISHM_RANKS=$(($PRK_MPI_PROCS/2))
         $PRK_RUN $PRK_TARGET_PATH/Synch_p2p/p2p                         10 1024 1024
         $PRK_RUN $PRK_TARGET_PATH/Stencil/stencil     $PRK_MPISHM_RANKS 10 1000
@@ -1015,7 +1015,11 @@ case "$PRK_TARGET" in
         export PRK_TARGET_PATH=CHARM++
         export PRK_CHARM_PROCS=4
         export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun
-        export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS ++local"
+        if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
+            export PRK_LAUNCHER_ARGS="+autoProvision +isomalloc_sync"
+        else
+            export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS ++local"
+        fi
         # For Charm++, the last argument is the overdecomposition factor -->               \|/
         $PRK_LAUNCHER $PRK_TARGET_PATH/Synch_p2p/p2p       $PRK_LAUNCHER_ARGS 10 1024 1024  1
         $PRK_LAUNCHER $PRK_TARGET_PATH/Stencil/stencil     $PRK_LAUNCHER_ARGS 10 1000       1
@@ -1039,7 +1043,11 @@ case "$PRK_TARGET" in
         export PRK_TARGET_PATH=AMPI
         export PRK_CHARM_PROCS=4
         export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun
-        export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS +vp$PRK_CHARM_PROCS +isomalloc_sync ++local"
+        if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
+            export PRK_LAUNCHER_ARGS="+autoProvision +isomalloc_sync"
+        else
+            export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS +vp$PRK_CHARM_PROCS +isomalloc_sync ++local"
+        fi
         export PRK_LOAD_BALANCER_ARGS="+balancer RefineLB"
         $PRK_LAUNCHER $PRK_TARGET_PATH/Synch_p2p/p2p       $PRK_LAUNCHER_ARGS 10 1024 1024
         $PRK_LAUNCHER $PRK_TARGET_PATH/Stencil/stencil     $PRK_LAUNCHER_ARGS 10 1000

From 4ffac0bcbae093e0882b2a252f8a033924cba4b6 Mon Sep 17 00:00:00 2001
From: Ronak Buch <ronakbuch@gmail.com>
Date: Mon, 18 Jun 2018 15:53:48 -0500
Subject: [PATCH 108/245] Fix spacing in Charm++ Stencil Makefile (#362)

---
 CHARM++/Stencil/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHARM++/Stencil/Makefile b/CHARM++/Stencil/Makefile
index c936aec7b..9fbde017d 100644
--- a/CHARM++/Stencil/Makefile
+++ b/CHARM++/Stencil/Makefile
@@ -64,7 +64,7 @@ RESTRICT_KEYWORD=0/1    disable/enable restrict keyword (aliasing) [0]  \n\
 STAR=0/1                box/star shaped stencil                    [1]  \n\
 VERBOSE=0/1             omit/include verbose run information       [0]"
 
-TUNEFLAGS    = $(RESTRICTFLAG) $(VERBOSEFLAG)$(USERFLAGS) $(LOOPGENFLAG)\
+TUNEFLAGS    = $(RESTRICTFLAG) $(VERBOSEFLAG) $(USERFLAGS) $(LOOPGENFLAG)\
                $(DOUBLEFLAG)   $(RADIUSFLAG) $(STARFLAG) 
 PROGRAM     = stencil
 OBJS        = $(PROGRAM).o $(COMOBJS)

From a13c1f7f01d903444094b7f6aa78dc61e193d42c Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 19 Jul 2018 21:45:08 -0700
Subject: [PATCH 109/245] format fix

---
 Cxx11/transpose-vector-pstl.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc
index e94172bd6..ac7aefb8a 100644
--- a/Cxx11/transpose-vector-pstl.cc
+++ b/Cxx11/transpose-vector-pstl.cc
@@ -58,9 +58,9 @@ int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #if defined(USE_PSTL)
-  std::cout << "C++17 Parallel STL Matrix transpose: B = A^T" << std::endl;
+  std::cout << "C++17/PSTL Matrix transpose: B = A^T" << std::endl;
 #else
-  std::cout << "C++11 STL Matrix transpose: B = A^T" << std::endl;
+  std::cout << "C++11/STL Matrix transpose: B = A^T" << std::endl;
 #endif
 
   //////////////////////////////////////////////////////////////////////
@@ -93,8 +93,8 @@ int main(int argc, char * argv[])
     return 1;
   }
 
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   /// Allocate space for the input and transpose matrix

From c3c4fca280751d679c8d8b3ec122e3892707113f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 19 Jul 2018 22:05:13 -0700
Subject: [PATCH 110/245] format fix

---
 Cxx11/transpose-vector-raja.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc
index d40cefc65..621068d43 100644
--- a/Cxx11/transpose-vector-raja.cc
+++ b/Cxx11/transpose-vector-raja.cc
@@ -260,14 +260,14 @@ int main(int argc, char * argv[])
   if (use_for=="tbb")    for_name = "TBB (static)";
   if (use_for=="tbbdyn") for_name = "TBB (dynamic)";
 
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
-  std::cout << "Tile size             = " << tile_size << "(compile-time constant, unlike other impls)" << std::endl;
-  std::cout << "RAJA threading        = " << for_name << std::endl;
-  std::cout << "RAJA forallN          = " << (use_nested ? "yes" : "no") << std::endl;
-  std::cout << "RAJA use tiling       = " << (use_tiled ? "yes" : "no") << std::endl;
-  std::cout << "RAJA use permute      = " << use_permute << std::endl;
-  std::cout << "RAJA use simd         = " << (use_simd ? "yes" : "no") << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Tile size            = " << tile_size << "(compile-time constant, unlike other impls)" << std::endl;
+  std::cout << "RAJA threading       = " << for_name << std::endl;
+  std::cout << "RAJA forallN         = " << (use_nested ? "yes" : "no") << std::endl;
+  std::cout << "RAJA use tiling      = " << (use_tiled ? "yes" : "no") << std::endl;
+  std::cout << "RAJA use permute     = " << use_permute << std::endl;
+  std::cout << "RAJA use simd        = " << (use_simd ? "yes" : "no") << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation

From b6f76ef80f93d75b40e19e7af1debe25108fffe0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 21 Jul 2018 21:42:34 -0700
Subject: [PATCH 111/245] format fix

---
 Cxx11/nstream-vector-pstl.cc        | 4 ++--
 Cxx11/p2p-hyperplane-vector-pstl.cc | 4 ++--
 Cxx11/stencil-vector-pstl.cc        | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index 0bab633b0..21b5e0b45 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -71,9 +71,9 @@ int main(int argc, char * argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #if defined(USE_PSTL)
-  std::cout << "C++17 Parallel STL STREAM triad: A = B + scalar * C" << std::endl;
+  std::cout << "C++17/PSTL STREAM triad: A = B + scalar * C" << std::endl;
 #else
-  std::cout << "C++11 STL STREAM triad: A = B + scalar * C" << std::endl;
+  std::cout << "C++11/STL STREAM triad: A = B + scalar * C" << std::endl;
 #endif
 
   //////////////////////////////////////////////////////////////////////
diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
index 132b26a45..c64757e6d 100644
--- a/Cxx11/p2p-hyperplane-vector-pstl.cc
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -67,9 +67,9 @@ int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #if defined(USE_PSTL)
-  std::cout << "C++17 PSTL HYPERPLANE pipeline execution on 2D grid" << std::endl;
+  std::cout << "C++17/PSTL HYPERPLANE pipeline execution on 2D grid" << std::endl;
 #else
-  std::cout << "C++11 STL HYPERPLANE pipeline execution on 2D grid" << std::endl;
+  std::cout << "C++11/STL HYPERPLANE pipeline execution on 2D grid" << std::endl;
 #endif
 
   //////////////////////////////////////////////////////////////////////
diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc
index a328b1420..ca3c83ec0 100644
--- a/Cxx11/stencil-vector-pstl.cc
+++ b/Cxx11/stencil-vector-pstl.cc
@@ -86,7 +86,7 @@ int main(int argc, char* argv[])
 {
   std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
 #if defined(USE_PSTL)
-  std::cout << "C++17/Parallel STL Stencil execution on 2D grid" << std::endl;
+  std::cout << "C++17/PSTL Stencil execution on 2D grid" << std::endl;
 #else
   std::cout << "C++11/STL Stencil execution on 2D grid" << std::endl;
 #endif

From 46741c0fc6f271bcef61c91505432eea8d864c60 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 22 Jul 2018 15:54:15 -0700
Subject: [PATCH 112/245] remove STL usage from OpenMP codes (#363)

---
 .gitignore                                    |  5 ++
 Cxx11/Makefile                                | 12 ++---
 Cxx11/generate-cxx-stencil.py                 |  2 +-
 ...eam-vector-openmp.cc => nstream-openmp.cc} |  6 +--
 ...ector-openmp.cc => p2p-doacross-openmp.cc} |  2 +-
 ...tor-openmp.cc => p2p-hyperplane-openmp.cc} |  2 +-
 ...cil-vector-openmp.cc => stencil-openmp.cc} |  9 ++--
 Cxx11/stencil_openmp.hpp                      | 20 ++++----
 ...e-vector-openmp.cc => transpose-openmp.cc} |  4 +-
 travis/build-run-prk.sh                       | 48 +++++++++----------
 10 files changed, 57 insertions(+), 53 deletions(-)
 rename Cxx11/{nstream-vector-openmp.cc => nstream-openmp.cc} (97%)
 rename Cxx11/{p2p-doacross-vector-openmp.cc => p2p-doacross-openmp.cc} (99%)
 rename Cxx11/{p2p-hyperplane-vector-openmp.cc => p2p-hyperplane-openmp.cc} (99%)
 rename Cxx11/{stencil-vector-openmp.cc => stencil-openmp.cc} (97%)
 rename Cxx11/{transpose-vector-openmp.cc => transpose-openmp.cc} (98%)

diff --git a/.gitignore b/.gitignore
index b6b88b3b2..d4a60c93f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -254,3 +254,8 @@ FORTRAN/transpose-ornlacc
 RUST/p2p/Cargo.lock
 RUST/stencil/Cargo.lock
 RUST/transpose/Cargo.lock
+nstream-openmp
+p2p-doacross-openmp
+p2p-hyperplane-openmp
+stencil-openmp
+transpose-openmp
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index d1223b894..54873e41d 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -68,20 +68,20 @@ endif
 
 all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA)
 
-p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \
+p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
      p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb
 
-stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \
+stencil: stencil-valarray stencil-vector stencil-vector-async stencil-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
 	 stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl \
 	 stencil-cuda
 
-transpose: transpose-valarray transpose-vector transpose-vector-async transpose-vector-openmp transpose-openmp-target \
+transpose: transpose-valarray transpose-vector transpose-vector-async transpose-openmp transpose-openmp-target \
 	   transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \
 	   transpose-vector-rangefor transpose-vector-tbb transpose-vector-thread transpose-kokkos transpose-opencl
 
-nstream: nstream-valarray nstream-vector nstream-vector-openmp nstream-openmp-target \
+nstream: nstream-valarray nstream-vector nstream-openmp nstream-openmp-target \
 	 nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \
 	 nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl
 
@@ -92,7 +92,7 @@ vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream
 
 valarray: transpose-valarray nstream-valarray
 
-openmp: p2p-hyperplane-vector-openmp p2p-tasks-openmp stencil-vector-openmp transpose-vector-openmp nstream-vector-openmp
+openmp: p2p-hyperplane-openmp p2p-tasks-openmp stencil-openmp transpose-openmp nstream-openmp
 
 target: stencil-openmp-target transpose-openmp-target nstream-openmp-target
 
@@ -135,7 +135,7 @@ boost-compute: nstream-vector-boost-compute
 # busted
 #nstream-valarray-boost-compute
 
-p2p-hyperplane-vector: p2p-hyperplane-vector-openmp.cc prk_util.h
+p2p-hyperplane-vector: p2p-hyperplane-openmp.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< -o $@
 
 transpose-opencl: transpose-opencl.cc transpose.cl prk_util.h prk_opencl.h
diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index 286d0dfb3..b6d7a2d72 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -7,7 +7,7 @@
 
 def codegen(src,pattern,stencil_size,radius,W,model):
     if (model=='openmp'):
-        src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
+        src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n')
         src.write('    OMP_FOR(collapse(2))\n')
         src.write('    for (auto it='+str(radius)+'; it<n-'+str(radius)+'; it+=t) {\n')
         src.write('      for (auto jt='+str(radius)+'; jt<n-'+str(radius)+'; jt+=t) {\n')
diff --git a/Cxx11/nstream-vector-openmp.cc b/Cxx11/nstream-openmp.cc
similarity index 97%
rename from Cxx11/nstream-vector-openmp.cc
rename to Cxx11/nstream-openmp.cc
index d48015df6..f3ea9bbd8 100644
--- a/Cxx11/nstream-vector-openmp.cc
+++ b/Cxx11/nstream-openmp.cc
@@ -117,9 +117,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A(length);
-  std::vector<double> B(length);
-  std::vector<double> C(length);
+  double * RESTRICT A = new double[length];
+  double * RESTRICT B = new double[length];
+  double * RESTRICT C = new double[length];
 
   double scalar = 3.0;
 
diff --git a/Cxx11/p2p-doacross-vector-openmp.cc b/Cxx11/p2p-doacross-openmp.cc
similarity index 99%
rename from Cxx11/p2p-doacross-vector-openmp.cc
rename to Cxx11/p2p-doacross-openmp.cc
index 37b9802f0..4b10f9fbe 100644
--- a/Cxx11/p2p-doacross-vector-openmp.cc
+++ b/Cxx11/p2p-doacross-openmp.cc
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  std::vector<double> grid(m*n);;
+  double * RESTRICT grid = new double[m*n];
 
   OMP_PARALLEL()
   {
diff --git a/Cxx11/p2p-hyperplane-vector-openmp.cc b/Cxx11/p2p-hyperplane-openmp.cc
similarity index 99%
rename from Cxx11/p2p-hyperplane-vector-openmp.cc
rename to Cxx11/p2p-hyperplane-openmp.cc
index 471ce336c..4a49584c9 100644
--- a/Cxx11/p2p-hyperplane-vector-openmp.cc
+++ b/Cxx11/p2p-hyperplane-openmp.cc
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  double * grid = new double[n*n];
+  double * RESTRICT grid = new double[n*n];
 
   OMP_PARALLEL()
   {
diff --git a/Cxx11/stencil-vector-openmp.cc b/Cxx11/stencil-openmp.cc
similarity index 97%
rename from Cxx11/stencil-vector-openmp.cc
rename to Cxx11/stencil-openmp.cc
index 5f5e59f42..14d299c58 100644
--- a/Cxx11/stencil-vector-openmp.cc
+++ b/Cxx11/stencil-openmp.cc
@@ -67,13 +67,13 @@
 #include "stencil_seq.hpp"
 #endif
 
-void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out)
+void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out)
 {
     std::cout << "You are trying to use a stencil that does not exist.\n";
     std::cout << "Please generate the new stencil using the code generator\n";
     std::cout << "and add it to the case-switch in the driver." << std::endl;
     // n will never be zero - this is to silence compiler warnings.
-    if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl;
+    if (n==0 || t==0) std::cout << in << out << std::endl;
     std::abort();
 }
 
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in(n*n);
-  std::vector<double> out(n*n);
+  double * RESTRICT in  = new double[n*n];
+  double * RESTRICT out = new double[n*n];
 
   OMP_PARALLEL()
   {
@@ -227,7 +227,6 @@ int main(int argc, char* argv[])
 
   // interior of grid with respect to stencil
   size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
-
   // compute L1 norm in parallel
   double norm = 0.0;
   OMP_PARALLEL_FOR_REDUCE( +:norm )
diff --git a/Cxx11/stencil_openmp.hpp b/Cxx11/stencil_openmp.hpp
index 42edf4570..2a42b437c 100644
--- a/Cxx11/stencil_openmp.hpp
+++ b/Cxx11/stencil_openmp.hpp
@@ -1,4 +1,4 @@
-void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=1; it<n-1; it+=t) {
       for (auto jt=1; jt<n-1; jt+=t) {
@@ -15,7 +15,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star2(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=2; it<n-2; it+=t) {
       for (auto jt=2; jt<n-2; jt+=t) {
@@ -36,7 +36,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star3(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=3; it<n-3; it+=t) {
       for (auto jt=3; jt<n-3; jt+=t) {
@@ -61,7 +61,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star4(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=4; it<n-4; it+=t) {
       for (auto jt=4; jt<n-4; jt+=t) {
@@ -90,7 +90,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star5(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=5; it<n-5; it+=t) {
       for (auto jt=5; jt<n-5; jt+=t) {
@@ -123,7 +123,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=1; it<n-1; it+=t) {
       for (auto jt=1; jt<n-1; jt+=t) {
@@ -143,7 +143,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid2(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=2; it<n-2; it+=t) {
       for (auto jt=2; jt<n-2; jt+=t) {
@@ -177,7 +177,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid3(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=3; it<n-3; it+=t) {
       for (auto jt=3; jt<n-3; jt+=t) {
@@ -233,7 +233,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid4(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=4; it<n-4; it+=t) {
       for (auto jt=4; jt<n-4; jt+=t) {
@@ -319,7 +319,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid5(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {
     OMP_FOR(collapse(2))
     for (auto it=5; it<n-5; it+=t) {
       for (auto jt=5; jt<n-5; jt+=t) {
diff --git a/Cxx11/transpose-vector-openmp.cc b/Cxx11/transpose-openmp.cc
similarity index 98%
rename from Cxx11/transpose-vector-openmp.cc
rename to Cxx11/transpose-openmp.cc
index ba3a26321..b81882614 100644
--- a/Cxx11/transpose-vector-openmp.cc
+++ b/Cxx11/transpose-openmp.cc
@@ -112,8 +112,8 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order);
+  double * RESTRICT A = new double[order*order];
+  double * RESTRICT B = new double[order*order];
 
   OMP_PARALLEL()
   {
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index cf197e44e..e4b25c3cb 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -359,18 +359,18 @@ case "$PRK_TARGET" in
             gcc)
                 # Host
                 echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \
-                                         transpose-vector-openmp nstream-vector-openmp
+                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \
+                                         transpose-openmp nstream-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
-                $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024
-                $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024 64
-                $PRK_TARGET_PATH/stencil-vector-openmp            10 1000
-                $PRK_TARGET_PATH/transpose-vector-openmp          10 1024 32
-                $PRK_TARGET_PATH/nstream-vector-openmp            10 16777216 32
+                $PRK_TARGET_PATH/p2p-hyperplane-openmp     10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-openmp     10 1024 64
+                $PRK_TARGET_PATH/stencil-openmp            10 1000
+                $PRK_TARGET_PATH/transpose-openmp          10 1024 32
+                $PRK_TARGET_PATH/nstream-openmp            10 16777216 32
                 #echo "Test stencil code generator"
                 for s in star grid ; do
                     for r in 1 2 3 4 5 ; do
-                        $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                        $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r
                     done
                 done
                 # Offload
@@ -381,7 +381,7 @@ case "$PRK_TARGET" in
                 #echo "Test stencil code generator"
                 for s in star grid ; do
                     for r in 1 2 3 4 5 ; do
-                        $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                        $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r
                     done
                 done
                 # ORNL-ACC
@@ -394,18 +394,18 @@ case "$PRK_TARGET" in
                 if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
                     # Host
                     echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                    make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \
-                                             transpose-vector-openmp nstream-vector-openmp
+                    make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \
+                                             transpose-openmp nstream-openmp
                     $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
-                    $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024
-                    $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp     10 1024 64
-                    $PRK_TARGET_PATH/stencil-vector-openmp            10 1000
-                    $PRK_TARGET_PATH/transpose-vector-openmp          10 1024 32
-                    $PRK_TARGET_PATH/nstream-vector-openmp            10 16777216 32
+                    $PRK_TARGET_PATH/p2p-hyperplane-openmp     10 1024
+                    $PRK_TARGET_PATH/p2p-hyperplane-openmp     10 1024 64
+                    $PRK_TARGET_PATH/stencil-openmp            10 1000
+                    $PRK_TARGET_PATH/transpose-openmp          10 1024 32
+                    $PRK_TARGET_PATH/nstream-openmp            10 16777216 32
                     #echo "Test stencil code generator"
                     for s in star grid ; do
                         for r in 1 2 3 4 5 ; do
-                            $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                            $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r
                         done
                     done
                     # Offload
@@ -416,7 +416,7 @@ case "$PRK_TARGET" in
                     ##echo "Test stencil code generator"
                     #for s in star grid ; do
                     #    for r in 1 2 3 4 5 ; do
-                    #        $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                    #        $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r
                     #    done
                     #done
                 else
@@ -426,17 +426,17 @@ case "$PRK_TARGET" in
             icc)
                 # Host
                 echo "OPENMPFLAG=-qopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-vector-openmp \
-                                         transpose-vector-openmp nstream-vector-openmp
+                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp \
+                                         transpose-openmp nstream-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-innerloop-openmp             10 1024 1024
-                $PRK_TARGET_PATH/stencil-vector-openmp            10 1000
-                $PRK_TARGET_PATH/transpose-vector-openmp          10 1024 32
-                $PRK_TARGET_PATH/nstream-vector-openmp            10 16777216 32
+                $PRK_TARGET_PATH/stencil-openmp            10 1000
+                $PRK_TARGET_PATH/transpose-openmp          10 1024 32
+                $PRK_TARGET_PATH/nstream-openmp            10 16777216 32
                 #echo "Test stencil code generator"
                 for s in star grid ; do
                     for r in 1 2 3 4 5 ; do
-                        $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r
+                        $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r
                     done
                 done
                 # Offload - not supported on MacOS

From f1968f1eb8156335dbdc5d8111fe0fdb89750053 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 14 Sep 2018 16:19:03 -0700
Subject: [PATCH 113/245] improve stencil code generator (#364)

* improve stencil code generator
* try C++17
---
 Cxx11/generate-cxx-stencil.py | 112 ++++---
 Cxx11/stencil_cuda.hpp        | 570 +++++++++++++++++-----------------
 Cxx11/stencil_kokkos.hpp      | 570 +++++++++++++++++-----------------
 Cxx11/stencil_openmp.hpp      | 570 +++++++++++++++++-----------------
 Cxx11/stencil_pgnu.hpp        | 570 +++++++++++++++++-----------------
 Cxx11/stencil_pstl.hpp        | 570 +++++++++++++++++-----------------
 Cxx11/stencil_raja.hpp        | 570 +++++++++++++++++-----------------
 Cxx11/stencil_rajaview.hpp    | 570 +++++++++++++++++-----------------
 Cxx11/stencil_rangefor.hpp    | 570 +++++++++++++++++-----------------
 Cxx11/stencil_seq.hpp         | 570 +++++++++++++++++-----------------
 Cxx11/stencil_stl.hpp         | 570 +++++++++++++++++-----------------
 Cxx11/stencil_target.hpp      | 570 +++++++++++++++++-----------------
 Cxx11/stencil_taskloop.hpp    | 570 +++++++++++++++++-----------------
 Cxx11/stencil_tbb.hpp         | 570 +++++++++++++++++-----------------
 travis/build-run-prk.sh       |   4 +-
 15 files changed, 3781 insertions(+), 3745 deletions(-)

diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index b6d7a2d72..18d826acd 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -5,6 +5,39 @@
 import string
 import os
 
+def bodygen(src,pattern,stencil_size,radius,W,model):
+    if (model=='kokkos' or model=='rajaview'):
+        src.write('              out(i,j) += ')
+    else:
+        src.write('            out[i*n+j] += ')
+    k = 0
+    kmax = stencil_size-1;
+    for j in range(0,2*radius+1):
+        if (j-radius)<0:
+            jr=str(j-radius)
+        elif (j-radius)==0:
+            jr=''
+        else:
+            jr='+'+str(j-radius)
+
+        for i in range(0,2*radius+1):
+            if (i-radius)<0:
+                ir=str(i-radius)
+            elif (i-radius)==0:
+                ir=''
+            else:
+                ir='+'+str(i-radius)
+
+            if ( W[j][i] != 0.0):
+                k+=1
+                if (model=='kokkos' or model=='rajaview'):
+                    src.write('+in(i'+ir+',j'+jr+') * '+str(W[j][i]))
+                else:
+                    src.write('+in[(i'+ir+')*n+(j'+jr+')] * '+str(W[j][i]))
+                if (k<kmax): src.write('\n')
+                if (k>0 and k<kmax): src.write('                          ')
+    src.write(';\n')
+
 def codegen(src,pattern,stencil_size,radius,W,model):
     if (model=='openmp'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n')
@@ -14,6 +47,11 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('        for (auto i=it; i<std::min(n-'+str(radius)+',it+t); ++i) {\n')
         src.write('          OMP_SIMD\n')
         src.write('          for (auto j=jt; j<std::min(n-'+str(radius)+',jt+t); ++j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('           }\n')
+        src.write('         }\n')
+        src.write('       }\n')
+        src.write('     }\n')
     elif (model=='taskloop'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {\n')
         src.write('    OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )\n')
@@ -22,33 +60,55 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('        for (auto i=it; i<std::min(n-'+str(radius)+',it+t); ++i) {\n')
         src.write('          OMP_SIMD\n')
         src.write('          for (auto j=jt; j<std::min(n-'+str(radius)+',jt+t); ++j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('           }\n')
+        src.write('         }\n')
+        src.write('       }\n')
+        src.write('     }\n')
     elif (model=='target'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n')
         src.write('    OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )\n')
         src.write('    for (auto i='+str(radius)+'; i<n-'+str(radius)+'; ++i) {\n')
         src.write('      for (auto j='+str(radius)+'; j<n-'+str(radius)+'; ++j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('       }\n')
+        src.write('     }\n')
     elif (model=='rangefor'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    for (auto i : inside) {\n')
         src.write('      PRAGMA_SIMD\n')
         src.write('      for (auto j : inside) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('           }\n')
+        src.write('         }\n')
+        src.write('       }\n')
+        src.write('     }\n')
     elif (model=='stl'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    std::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n')
         #src.write('      PRAGMA_SIMD\n')
         src.write('      std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('      });\n')
+        src.write('    });\n')
     elif (model=='pgnu'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n')
         src.write('      std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('      });\n')
+        src.write('    });\n')
     elif (model=='pstl'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {\n')
         src.write('      std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('      });\n')
+        src.write('    });\n')
     elif (model=='raja'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         #src.write('    RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>\n')
@@ -57,11 +117,16 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         #src.write('              [&](RAJA::Index_type i, RAJA::Index_type j) {\n')
         src.write('    RAJA::forall<thread_exec>(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type i) {\n')
         src.write('      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('      });\n')
+        src.write('    });\n')
     elif (model=='rajaview'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n')
         src.write('    RAJA::RangeSegment inner1('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    auto inner2 = RAJA::make_tuple(inner1, inner1);\n')
         src.write('    RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('    });\n')
     elif (model=='tbb'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('  tbb::blocked_range2d<int> range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n')
@@ -69,15 +134,23 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('    for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {\n')
         src.write('      PRAGMA_SIMD\n')
         src.write('      for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('      }\n')
+        src.write('    }\n')
+        src.write('  }, tbb_partitioner );\n')
     elif (model=='kokkos'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n')
         src.write('    auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({'+str(radius)+','+str(radius)+'},{n-'+str(radius)+',n-'+str(radius)+'},{t,t});\n')
         src.write('    Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('    });\n')
     elif (model=='cuda'):
         src.write('__global__ void '+pattern+str(radius)+'(const int n, const prk_float * in, prk_float * out) {\n')
         src.write('    const int i = blockIdx.x * blockDim.x + threadIdx.x;\n')
         src.write('    const int j = blockIdx.y * blockDim.y + threadIdx.y;\n')
         src.write('    if ( ('+str(radius)+' <= i) && (i < n-'+str(radius)+') && ('+str(radius)+' <= j) && (j < n-'+str(radius)+') ) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('     }\n')
     else:
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    for (auto it='+str(radius)+'; it<n-'+str(radius)+'; it+=t) {\n')
@@ -85,44 +158,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('        for (auto i=it; i<std::min(n-'+str(radius)+',it+t); ++i) {\n')
         src.write('          PRAGMA_SIMD\n')
         src.write('          for (auto j=jt; j<std::min(n-'+str(radius)+',jt+t); ++j) {\n')
-    if (model=='kokkos' or model=='rajaview'):
-        src.write('              out(i,j) += ')
-    else:
-        src.write('            out[i*n+j] += ')
-    k = 0
-    kmax = stencil_size-1;
-    for j in range(0,2*radius+1):
-        for i in range(0,2*radius+1):
-            if ( W[j][i] != 0.0):
-                k+=1
-                if (model=='kokkos' or model=='rajaview'):
-                    src.write('+in(i+'+str(j-radius)+',j+'+str(i-radius)+') * '+str(W[j][i]))
-                else:
-                    src.write('+in[(i+'+str(j-radius)+')*n+(j+'+str(i-radius)+')] * '+str(W[j][i]))
-                if (k<kmax): src.write('\n')
-                if (k>0 and k<kmax): src.write('                          ')
-    src.write(';\n')
-    if (model=='stl' or model=='pgnu' or model=='pstl'):
-        src.write('      });\n')
-        src.write('    });\n')
-    elif (model=='raja'):
-        #src.write('     });\n')
-        src.write('      });\n')
-        src.write('    });\n')
-    elif (model=='rajaview'):
-        src.write('    });\n')
-    elif (model=='kokkos'):
-        src.write('    });\n')
-    elif (model=='tbb'):
-        src.write('      }\n')
-        src.write('    }\n')
-        src.write('  }, tbb_partitioner );\n')
-    elif (model=='target'):
-        src.write('       }\n')
-        src.write('     }\n')
-    elif (model=='cuda'):
-        src.write('     }\n')
-    else:
+        bodygen(src,pattern,stencil_size,radius,W,model)
         src.write('           }\n')
         src.write('         }\n')
         src.write('       }\n')
diff --git a/Cxx11/stencil_cuda.hpp b/Cxx11/stencil_cuda.hpp
index 1783327fa..bfc10122d 100644
--- a/Cxx11/stencil_cuda.hpp
+++ b/Cxx11/stencil_cuda.hpp
@@ -2,10 +2,10 @@ __global__ void star1(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (1 <= i) && (i < n-1) && (1 <= j) && (j < n-1) ) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
      }
 }
 
@@ -13,14 +13,14 @@ __global__ void star2(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (2 <= i) && (i < n-2) && (2 <= j) && (j < n-2) ) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
      }
 }
 
@@ -28,18 +28,18 @@ __global__ void star3(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (3 <= i) && (i < n-3) && (3 <= j) && (j < n-3) ) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
      }
 }
 
@@ -47,22 +47,22 @@ __global__ void star4(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (4 <= i) && (i < n-4) && (4 <= j) && (j < n-4) ) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
      }
 }
 
@@ -70,26 +70,26 @@ __global__ void star5(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (5 <= i) && (i < n-5) && (5 <= j) && (j < n-5) ) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
      }
 }
 
@@ -97,11 +97,11 @@ __global__ void grid1(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (1 <= i) && (i < n-1) && (1 <= j) && (j < n-1) ) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
      }
@@ -111,25 +111,25 @@ __global__ void grid2(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (2 <= i) && (i < n-2) && (2 <= j) && (j < n-2) ) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
      }
@@ -139,47 +139,47 @@ __global__ void grid3(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (3 <= i) && (i < n-3) && (3 <= j) && (j < n-3) ) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
      }
@@ -189,77 +189,77 @@ __global__ void grid4(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (4 <= i) && (i < n-4) && (4 <= j) && (j < n-4) ) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
      }
@@ -269,115 +269,115 @@ __global__ void grid5(const int n, const prk_float * in, prk_float * out) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     const int j = blockIdx.y * blockDim.y + threadIdx.y;
     if ( (5 <= i) && (i < n-5) && (5 <= j) && (j < n-5) ) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
      }
diff --git a/Cxx11/stencil_kokkos.hpp b/Cxx11/stencil_kokkos.hpp
index 94fffab58..f82b3ac02 100644
--- a/Cxx11/stencil_kokkos.hpp
+++ b/Cxx11/stencil_kokkos.hpp
@@ -1,101 +1,101 @@
 void star1(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({1,1},{n-1,n-1},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-1,j+0) * -0.5
-                          +in(i+0,j+-1) * -0.5
-                          +in(i+0,j+1) * 0.5
-                          +in(i+1,j+0) * 0.5;
+              out(i,j) += +in(i,j-1) * -0.5
+                          +in(i-1,j) * -0.5
+                          +in(i+1,j) * 0.5
+                          +in(i,j+1) * 0.5;
     });
 }
 
 void star2(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({2,2},{n-2,n-2},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-2,j+0) * -0.125
-                          +in(i+-1,j+0) * -0.25
-                          +in(i+0,j+-2) * -0.125
-                          +in(i+0,j+-1) * -0.25
-                          +in(i+0,j+1) * 0.25
-                          +in(i+0,j+2) * 0.125
-                          +in(i+1,j+0) * 0.25
-                          +in(i+2,j+0) * 0.125;
+              out(i,j) += +in(i,j-2) * -0.125
+                          +in(i,j-1) * -0.25
+                          +in(i-2,j) * -0.125
+                          +in(i-1,j) * -0.25
+                          +in(i+1,j) * 0.25
+                          +in(i+2,j) * 0.125
+                          +in(i,j+1) * 0.25
+                          +in(i,j+2) * 0.125;
     });
 }
 
 void star3(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({3,3},{n-3,n-3},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-3,j+0) * -0.0555555555556
-                          +in(i+-2,j+0) * -0.0833333333333
-                          +in(i+-1,j+0) * -0.166666666667
-                          +in(i+0,j+-3) * -0.0555555555556
-                          +in(i+0,j+-2) * -0.0833333333333
-                          +in(i+0,j+-1) * -0.166666666667
-                          +in(i+0,j+1) * 0.166666666667
-                          +in(i+0,j+2) * 0.0833333333333
-                          +in(i+0,j+3) * 0.0555555555556
-                          +in(i+1,j+0) * 0.166666666667
-                          +in(i+2,j+0) * 0.0833333333333
-                          +in(i+3,j+0) * 0.0555555555556;
+              out(i,j) += +in(i,j-3) * -0.0555555555556
+                          +in(i,j-2) * -0.0833333333333
+                          +in(i,j-1) * -0.166666666667
+                          +in(i-3,j) * -0.0555555555556
+                          +in(i-2,j) * -0.0833333333333
+                          +in(i-1,j) * -0.166666666667
+                          +in(i+1,j) * 0.166666666667
+                          +in(i+2,j) * 0.0833333333333
+                          +in(i+3,j) * 0.0555555555556
+                          +in(i,j+1) * 0.166666666667
+                          +in(i,j+2) * 0.0833333333333
+                          +in(i,j+3) * 0.0555555555556;
     });
 }
 
 void star4(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({4,4},{n-4,n-4},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-4,j+0) * -0.03125
-                          +in(i+-3,j+0) * -0.0416666666667
-                          +in(i+-2,j+0) * -0.0625
-                          +in(i+-1,j+0) * -0.125
-                          +in(i+0,j+-4) * -0.03125
-                          +in(i+0,j+-3) * -0.0416666666667
-                          +in(i+0,j+-2) * -0.0625
-                          +in(i+0,j+-1) * -0.125
-                          +in(i+0,j+1) * 0.125
-                          +in(i+0,j+2) * 0.0625
-                          +in(i+0,j+3) * 0.0416666666667
-                          +in(i+0,j+4) * 0.03125
-                          +in(i+1,j+0) * 0.125
-                          +in(i+2,j+0) * 0.0625
-                          +in(i+3,j+0) * 0.0416666666667
-                          +in(i+4,j+0) * 0.03125;
+              out(i,j) += +in(i,j-4) * -0.03125
+                          +in(i,j-3) * -0.0416666666667
+                          +in(i,j-2) * -0.0625
+                          +in(i,j-1) * -0.125
+                          +in(i-4,j) * -0.03125
+                          +in(i-3,j) * -0.0416666666667
+                          +in(i-2,j) * -0.0625
+                          +in(i-1,j) * -0.125
+                          +in(i+1,j) * 0.125
+                          +in(i+2,j) * 0.0625
+                          +in(i+3,j) * 0.0416666666667
+                          +in(i+4,j) * 0.03125
+                          +in(i,j+1) * 0.125
+                          +in(i,j+2) * 0.0625
+                          +in(i,j+3) * 0.0416666666667
+                          +in(i,j+4) * 0.03125;
     });
 }
 
 void star5(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({5,5},{n-5,n-5},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-5,j+0) * -0.02
-                          +in(i+-4,j+0) * -0.025
-                          +in(i+-3,j+0) * -0.0333333333333
-                          +in(i+-2,j+0) * -0.05
-                          +in(i+-1,j+0) * -0.1
-                          +in(i+0,j+-5) * -0.02
-                          +in(i+0,j+-4) * -0.025
-                          +in(i+0,j+-3) * -0.0333333333333
-                          +in(i+0,j+-2) * -0.05
-                          +in(i+0,j+-1) * -0.1
-                          +in(i+0,j+1) * 0.1
-                          +in(i+0,j+2) * 0.05
-                          +in(i+0,j+3) * 0.0333333333333
-                          +in(i+0,j+4) * 0.025
-                          +in(i+0,j+5) * 0.02
-                          +in(i+1,j+0) * 0.1
-                          +in(i+2,j+0) * 0.05
-                          +in(i+3,j+0) * 0.0333333333333
-                          +in(i+4,j+0) * 0.025
-                          +in(i+5,j+0) * 0.02;
+              out(i,j) += +in(i,j-5) * -0.02
+                          +in(i,j-4) * -0.025
+                          +in(i,j-3) * -0.0333333333333
+                          +in(i,j-2) * -0.05
+                          +in(i,j-1) * -0.1
+                          +in(i-5,j) * -0.02
+                          +in(i-4,j) * -0.025
+                          +in(i-3,j) * -0.0333333333333
+                          +in(i-2,j) * -0.05
+                          +in(i-1,j) * -0.1
+                          +in(i+1,j) * 0.1
+                          +in(i+2,j) * 0.05
+                          +in(i+3,j) * 0.0333333333333
+                          +in(i+4,j) * 0.025
+                          +in(i+5,j) * 0.02
+                          +in(i,j+1) * 0.1
+                          +in(i,j+2) * 0.05
+                          +in(i,j+3) * 0.0333333333333
+                          +in(i,j+4) * 0.025
+                          +in(i,j+5) * 0.02;
     });
 }
 
 void grid1(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({1,1},{n-1,n-1},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-1,j+-1) * -0.25
-                          +in(i+-1,j+0) * -0.25
-                          +in(i+0,j+-1) * -0.25
-                          +in(i+0,j+1) * 0.25
-                          +in(i+1,j+0) * 0.25
+              out(i,j) += +in(i-1,j-1) * -0.25
+                          +in(i,j-1) * -0.25
+                          +in(i-1,j) * -0.25
+                          +in(i+1,j) * 0.25
+                          +in(i,j+1) * 0.25
                           +in(i+1,j+1) * 0.25
                           ;
     });
@@ -104,25 +104,25 @@ void grid1(const int n, const int t, matrix & in, matrix & out) {
 void grid2(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({2,2},{n-2,n-2},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-2,j+-2) * -0.0625
-                          +in(i+-2,j+-1) * -0.0208333333333
-                          +in(i+-2,j+0) * -0.0208333333333
-                          +in(i+-2,j+1) * -0.0208333333333
-                          +in(i+-1,j+-2) * -0.0208333333333
-                          +in(i+-1,j+-1) * -0.125
-                          +in(i+-1,j+0) * -0.125
-                          +in(i+-1,j+2) * 0.0208333333333
-                          +in(i+0,j+-2) * -0.0208333333333
-                          +in(i+0,j+-1) * -0.125
-                          +in(i+0,j+1) * 0.125
-                          +in(i+0,j+2) * 0.0208333333333
-                          +in(i+1,j+-2) * -0.0208333333333
-                          +in(i+1,j+0) * 0.125
+              out(i,j) += +in(i-2,j-2) * -0.0625
+                          +in(i-1,j-2) * -0.0208333333333
+                          +in(i,j-2) * -0.0208333333333
+                          +in(i+1,j-2) * -0.0208333333333
+                          +in(i-2,j-1) * -0.0208333333333
+                          +in(i-1,j-1) * -0.125
+                          +in(i,j-1) * -0.125
+                          +in(i+2,j-1) * 0.0208333333333
+                          +in(i-2,j) * -0.0208333333333
+                          +in(i-1,j) * -0.125
+                          +in(i+1,j) * 0.125
+                          +in(i+2,j) * 0.0208333333333
+                          +in(i-2,j+1) * -0.0208333333333
+                          +in(i,j+1) * 0.125
                           +in(i+1,j+1) * 0.125
-                          +in(i+1,j+2) * 0.0208333333333
-                          +in(i+2,j+-1) * 0.0208333333333
-                          +in(i+2,j+0) * 0.0208333333333
                           +in(i+2,j+1) * 0.0208333333333
+                          +in(i-1,j+2) * 0.0208333333333
+                          +in(i,j+2) * 0.0208333333333
+                          +in(i+1,j+2) * 0.0208333333333
                           +in(i+2,j+2) * 0.0625
                           ;
     });
@@ -131,47 +131,47 @@ void grid2(const int n, const int t, matrix & in, matrix & out) {
 void grid3(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({3,3},{n-3,n-3},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-3,j+-3) * -0.0277777777778
-                          +in(i+-3,j+-2) * -0.00555555555556
-                          +in(i+-3,j+-1) * -0.00555555555556
-                          +in(i+-3,j+0) * -0.00555555555556
-                          +in(i+-3,j+1) * -0.00555555555556
-                          +in(i+-3,j+2) * -0.00555555555556
-                          +in(i+-2,j+-3) * -0.00555555555556
-                          +in(i+-2,j+-2) * -0.0416666666667
-                          +in(i+-2,j+-1) * -0.0138888888889
-                          +in(i+-2,j+0) * -0.0138888888889
-                          +in(i+-2,j+1) * -0.0138888888889
-                          +in(i+-2,j+3) * 0.00555555555556
-                          +in(i+-1,j+-3) * -0.00555555555556
-                          +in(i+-1,j+-2) * -0.0138888888889
-                          +in(i+-1,j+-1) * -0.0833333333333
-                          +in(i+-1,j+0) * -0.0833333333333
-                          +in(i+-1,j+2) * 0.0138888888889
-                          +in(i+-1,j+3) * 0.00555555555556
-                          +in(i+0,j+-3) * -0.00555555555556
-                          +in(i+0,j+-2) * -0.0138888888889
-                          +in(i+0,j+-1) * -0.0833333333333
-                          +in(i+0,j+1) * 0.0833333333333
-                          +in(i+0,j+2) * 0.0138888888889
-                          +in(i+0,j+3) * 0.00555555555556
-                          +in(i+1,j+-3) * -0.00555555555556
-                          +in(i+1,j+-2) * -0.0138888888889
-                          +in(i+1,j+0) * 0.0833333333333
+              out(i,j) += +in(i-3,j-3) * -0.0277777777778
+                          +in(i-2,j-3) * -0.00555555555556
+                          +in(i-1,j-3) * -0.00555555555556
+                          +in(i,j-3) * -0.00555555555556
+                          +in(i+1,j-3) * -0.00555555555556
+                          +in(i+2,j-3) * -0.00555555555556
+                          +in(i-3,j-2) * -0.00555555555556
+                          +in(i-2,j-2) * -0.0416666666667
+                          +in(i-1,j-2) * -0.0138888888889
+                          +in(i,j-2) * -0.0138888888889
+                          +in(i+1,j-2) * -0.0138888888889
+                          +in(i+3,j-2) * 0.00555555555556
+                          +in(i-3,j-1) * -0.00555555555556
+                          +in(i-2,j-1) * -0.0138888888889
+                          +in(i-1,j-1) * -0.0833333333333
+                          +in(i,j-1) * -0.0833333333333
+                          +in(i+2,j-1) * 0.0138888888889
+                          +in(i+3,j-1) * 0.00555555555556
+                          +in(i-3,j) * -0.00555555555556
+                          +in(i-2,j) * -0.0138888888889
+                          +in(i-1,j) * -0.0833333333333
+                          +in(i+1,j) * 0.0833333333333
+                          +in(i+2,j) * 0.0138888888889
+                          +in(i+3,j) * 0.00555555555556
+                          +in(i-3,j+1) * -0.00555555555556
+                          +in(i-2,j+1) * -0.0138888888889
+                          +in(i,j+1) * 0.0833333333333
                           +in(i+1,j+1) * 0.0833333333333
-                          +in(i+1,j+2) * 0.0138888888889
-                          +in(i+1,j+3) * 0.00555555555556
-                          +in(i+2,j+-3) * -0.00555555555556
-                          +in(i+2,j+-1) * 0.0138888888889
-                          +in(i+2,j+0) * 0.0138888888889
                           +in(i+2,j+1) * 0.0138888888889
-                          +in(i+2,j+2) * 0.0416666666667
-                          +in(i+2,j+3) * 0.00555555555556
-                          +in(i+3,j+-2) * 0.00555555555556
-                          +in(i+3,j+-1) * 0.00555555555556
-                          +in(i+3,j+0) * 0.00555555555556
                           +in(i+3,j+1) * 0.00555555555556
+                          +in(i-3,j+2) * -0.00555555555556
+                          +in(i-1,j+2) * 0.0138888888889
+                          +in(i,j+2) * 0.0138888888889
+                          +in(i+1,j+2) * 0.0138888888889
+                          +in(i+2,j+2) * 0.0416666666667
                           +in(i+3,j+2) * 0.00555555555556
+                          +in(i-2,j+3) * 0.00555555555556
+                          +in(i-1,j+3) * 0.00555555555556
+                          +in(i,j+3) * 0.00555555555556
+                          +in(i+1,j+3) * 0.00555555555556
+                          +in(i+2,j+3) * 0.00555555555556
                           +in(i+3,j+3) * 0.0277777777778
                           ;
     });
@@ -180,77 +180,77 @@ void grid3(const int n, const int t, matrix & in, matrix & out) {
 void grid4(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({4,4},{n-4,n-4},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-4,j+-4) * -0.015625
-                          +in(i+-4,j+-3) * -0.00223214285714
-                          +in(i+-4,j+-2) * -0.00223214285714
-                          +in(i+-4,j+-1) * -0.00223214285714
-                          +in(i+-4,j+0) * -0.00223214285714
-                          +in(i+-4,j+1) * -0.00223214285714
-                          +in(i+-4,j+2) * -0.00223214285714
-                          +in(i+-4,j+3) * -0.00223214285714
-                          +in(i+-3,j+-4) * -0.00223214285714
-                          +in(i+-3,j+-3) * -0.0208333333333
-                          +in(i+-3,j+-2) * -0.00416666666667
-                          +in(i+-3,j+-1) * -0.00416666666667
-                          +in(i+-3,j+0) * -0.00416666666667
-                          +in(i+-3,j+1) * -0.00416666666667
-                          +in(i+-3,j+2) * -0.00416666666667
-                          +in(i+-3,j+4) * 0.00223214285714
-                          +in(i+-2,j+-4) * -0.00223214285714
-                          +in(i+-2,j+-3) * -0.00416666666667
-                          +in(i+-2,j+-2) * -0.03125
-                          +in(i+-2,j+-1) * -0.0104166666667
-                          +in(i+-2,j+0) * -0.0104166666667
-                          +in(i+-2,j+1) * -0.0104166666667
-                          +in(i+-2,j+3) * 0.00416666666667
-                          +in(i+-2,j+4) * 0.00223214285714
-                          +in(i+-1,j+-4) * -0.00223214285714
-                          +in(i+-1,j+-3) * -0.00416666666667
-                          +in(i+-1,j+-2) * -0.0104166666667
-                          +in(i+-1,j+-1) * -0.0625
-                          +in(i+-1,j+0) * -0.0625
-                          +in(i+-1,j+2) * 0.0104166666667
-                          +in(i+-1,j+3) * 0.00416666666667
-                          +in(i+-1,j+4) * 0.00223214285714
-                          +in(i+0,j+-4) * -0.00223214285714
-                          +in(i+0,j+-3) * -0.00416666666667
-                          +in(i+0,j+-2) * -0.0104166666667
-                          +in(i+0,j+-1) * -0.0625
-                          +in(i+0,j+1) * 0.0625
-                          +in(i+0,j+2) * 0.0104166666667
-                          +in(i+0,j+3) * 0.00416666666667
-                          +in(i+0,j+4) * 0.00223214285714
-                          +in(i+1,j+-4) * -0.00223214285714
-                          +in(i+1,j+-3) * -0.00416666666667
-                          +in(i+1,j+-2) * -0.0104166666667
-                          +in(i+1,j+0) * 0.0625
+              out(i,j) += +in(i-4,j-4) * -0.015625
+                          +in(i-3,j-4) * -0.00223214285714
+                          +in(i-2,j-4) * -0.00223214285714
+                          +in(i-1,j-4) * -0.00223214285714
+                          +in(i,j-4) * -0.00223214285714
+                          +in(i+1,j-4) * -0.00223214285714
+                          +in(i+2,j-4) * -0.00223214285714
+                          +in(i+3,j-4) * -0.00223214285714
+                          +in(i-4,j-3) * -0.00223214285714
+                          +in(i-3,j-3) * -0.0208333333333
+                          +in(i-2,j-3) * -0.00416666666667
+                          +in(i-1,j-3) * -0.00416666666667
+                          +in(i,j-3) * -0.00416666666667
+                          +in(i+1,j-3) * -0.00416666666667
+                          +in(i+2,j-3) * -0.00416666666667
+                          +in(i+4,j-3) * 0.00223214285714
+                          +in(i-4,j-2) * -0.00223214285714
+                          +in(i-3,j-2) * -0.00416666666667
+                          +in(i-2,j-2) * -0.03125
+                          +in(i-1,j-2) * -0.0104166666667
+                          +in(i,j-2) * -0.0104166666667
+                          +in(i+1,j-2) * -0.0104166666667
+                          +in(i+3,j-2) * 0.00416666666667
+                          +in(i+4,j-2) * 0.00223214285714
+                          +in(i-4,j-1) * -0.00223214285714
+                          +in(i-3,j-1) * -0.00416666666667
+                          +in(i-2,j-1) * -0.0104166666667
+                          +in(i-1,j-1) * -0.0625
+                          +in(i,j-1) * -0.0625
+                          +in(i+2,j-1) * 0.0104166666667
+                          +in(i+3,j-1) * 0.00416666666667
+                          +in(i+4,j-1) * 0.00223214285714
+                          +in(i-4,j) * -0.00223214285714
+                          +in(i-3,j) * -0.00416666666667
+                          +in(i-2,j) * -0.0104166666667
+                          +in(i-1,j) * -0.0625
+                          +in(i+1,j) * 0.0625
+                          +in(i+2,j) * 0.0104166666667
+                          +in(i+3,j) * 0.00416666666667
+                          +in(i+4,j) * 0.00223214285714
+                          +in(i-4,j+1) * -0.00223214285714
+                          +in(i-3,j+1) * -0.00416666666667
+                          +in(i-2,j+1) * -0.0104166666667
+                          +in(i,j+1) * 0.0625
                           +in(i+1,j+1) * 0.0625
-                          +in(i+1,j+2) * 0.0104166666667
-                          +in(i+1,j+3) * 0.00416666666667
-                          +in(i+1,j+4) * 0.00223214285714
-                          +in(i+2,j+-4) * -0.00223214285714
-                          +in(i+2,j+-3) * -0.00416666666667
-                          +in(i+2,j+-1) * 0.0104166666667
-                          +in(i+2,j+0) * 0.0104166666667
                           +in(i+2,j+1) * 0.0104166666667
-                          +in(i+2,j+2) * 0.03125
-                          +in(i+2,j+3) * 0.00416666666667
-                          +in(i+2,j+4) * 0.00223214285714
-                          +in(i+3,j+-4) * -0.00223214285714
-                          +in(i+3,j+-2) * 0.00416666666667
-                          +in(i+3,j+-1) * 0.00416666666667
-                          +in(i+3,j+0) * 0.00416666666667
                           +in(i+3,j+1) * 0.00416666666667
-                          +in(i+3,j+2) * 0.00416666666667
-                          +in(i+3,j+3) * 0.0208333333333
-                          +in(i+3,j+4) * 0.00223214285714
-                          +in(i+4,j+-3) * 0.00223214285714
-                          +in(i+4,j+-2) * 0.00223214285714
-                          +in(i+4,j+-1) * 0.00223214285714
-                          +in(i+4,j+0) * 0.00223214285714
                           +in(i+4,j+1) * 0.00223214285714
+                          +in(i-4,j+2) * -0.00223214285714
+                          +in(i-3,j+2) * -0.00416666666667
+                          +in(i-1,j+2) * 0.0104166666667
+                          +in(i,j+2) * 0.0104166666667
+                          +in(i+1,j+2) * 0.0104166666667
+                          +in(i+2,j+2) * 0.03125
+                          +in(i+3,j+2) * 0.00416666666667
                           +in(i+4,j+2) * 0.00223214285714
+                          +in(i-4,j+3) * -0.00223214285714
+                          +in(i-2,j+3) * 0.00416666666667
+                          +in(i-1,j+3) * 0.00416666666667
+                          +in(i,j+3) * 0.00416666666667
+                          +in(i+1,j+3) * 0.00416666666667
+                          +in(i+2,j+3) * 0.00416666666667
+                          +in(i+3,j+3) * 0.0208333333333
                           +in(i+4,j+3) * 0.00223214285714
+                          +in(i-3,j+4) * 0.00223214285714
+                          +in(i-2,j+4) * 0.00223214285714
+                          +in(i-1,j+4) * 0.00223214285714
+                          +in(i,j+4) * 0.00223214285714
+                          +in(i+1,j+4) * 0.00223214285714
+                          +in(i+2,j+4) * 0.00223214285714
+                          +in(i+3,j+4) * 0.00223214285714
                           +in(i+4,j+4) * 0.015625
                           ;
     });
@@ -259,115 +259,115 @@ void grid4(const int n, const int t, matrix & in, matrix & out) {
 void grid5(const int n, const int t, matrix & in, matrix & out) {
     auto inside = Kokkos::MDRangePolicy<Kokkos::Rank<2>>({5,5},{n-5,n-5},{t,t});
     Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {
-              out(i,j) += +in(i+-5,j+-5) * -0.01
-                          +in(i+-5,j+-4) * -0.00111111111111
-                          +in(i+-5,j+-3) * -0.00111111111111
-                          +in(i+-5,j+-2) * -0.00111111111111
-                          +in(i+-5,j+-1) * -0.00111111111111
-                          +in(i+-5,j+0) * -0.00111111111111
-                          +in(i+-5,j+1) * -0.00111111111111
-                          +in(i+-5,j+2) * -0.00111111111111
-                          +in(i+-5,j+3) * -0.00111111111111
-                          +in(i+-5,j+4) * -0.00111111111111
-                          +in(i+-4,j+-5) * -0.00111111111111
-                          +in(i+-4,j+-4) * -0.0125
-                          +in(i+-4,j+-3) * -0.00178571428571
-                          +in(i+-4,j+-2) * -0.00178571428571
-                          +in(i+-4,j+-1) * -0.00178571428571
-                          +in(i+-4,j+0) * -0.00178571428571
-                          +in(i+-4,j+1) * -0.00178571428571
-                          +in(i+-4,j+2) * -0.00178571428571
-                          +in(i+-4,j+3) * -0.00178571428571
-                          +in(i+-4,j+5) * 0.00111111111111
-                          +in(i+-3,j+-5) * -0.00111111111111
-                          +in(i+-3,j+-4) * -0.00178571428571
-                          +in(i+-3,j+-3) * -0.0166666666667
-                          +in(i+-3,j+-2) * -0.00333333333333
-                          +in(i+-3,j+-1) * -0.00333333333333
-                          +in(i+-3,j+0) * -0.00333333333333
-                          +in(i+-3,j+1) * -0.00333333333333
-                          +in(i+-3,j+2) * -0.00333333333333
-                          +in(i+-3,j+4) * 0.00178571428571
-                          +in(i+-3,j+5) * 0.00111111111111
-                          +in(i+-2,j+-5) * -0.00111111111111
-                          +in(i+-2,j+-4) * -0.00178571428571
-                          +in(i+-2,j+-3) * -0.00333333333333
-                          +in(i+-2,j+-2) * -0.025
-                          +in(i+-2,j+-1) * -0.00833333333333
-                          +in(i+-2,j+0) * -0.00833333333333
-                          +in(i+-2,j+1) * -0.00833333333333
-                          +in(i+-2,j+3) * 0.00333333333333
-                          +in(i+-2,j+4) * 0.00178571428571
-                          +in(i+-2,j+5) * 0.00111111111111
-                          +in(i+-1,j+-5) * -0.00111111111111
-                          +in(i+-1,j+-4) * -0.00178571428571
-                          +in(i+-1,j+-3) * -0.00333333333333
-                          +in(i+-1,j+-2) * -0.00833333333333
-                          +in(i+-1,j+-1) * -0.05
-                          +in(i+-1,j+0) * -0.05
-                          +in(i+-1,j+2) * 0.00833333333333
-                          +in(i+-1,j+3) * 0.00333333333333
-                          +in(i+-1,j+4) * 0.00178571428571
-                          +in(i+-1,j+5) * 0.00111111111111
-                          +in(i+0,j+-5) * -0.00111111111111
-                          +in(i+0,j+-4) * -0.00178571428571
-                          +in(i+0,j+-3) * -0.00333333333333
-                          +in(i+0,j+-2) * -0.00833333333333
-                          +in(i+0,j+-1) * -0.05
-                          +in(i+0,j+1) * 0.05
-                          +in(i+0,j+2) * 0.00833333333333
-                          +in(i+0,j+3) * 0.00333333333333
-                          +in(i+0,j+4) * 0.00178571428571
-                          +in(i+0,j+5) * 0.00111111111111
-                          +in(i+1,j+-5) * -0.00111111111111
-                          +in(i+1,j+-4) * -0.00178571428571
-                          +in(i+1,j+-3) * -0.00333333333333
-                          +in(i+1,j+-2) * -0.00833333333333
-                          +in(i+1,j+0) * 0.05
+              out(i,j) += +in(i-5,j-5) * -0.01
+                          +in(i-4,j-5) * -0.00111111111111
+                          +in(i-3,j-5) * -0.00111111111111
+                          +in(i-2,j-5) * -0.00111111111111
+                          +in(i-1,j-5) * -0.00111111111111
+                          +in(i,j-5) * -0.00111111111111
+                          +in(i+1,j-5) * -0.00111111111111
+                          +in(i+2,j-5) * -0.00111111111111
+                          +in(i+3,j-5) * -0.00111111111111
+                          +in(i+4,j-5) * -0.00111111111111
+                          +in(i-5,j-4) * -0.00111111111111
+                          +in(i-4,j-4) * -0.0125
+                          +in(i-3,j-4) * -0.00178571428571
+                          +in(i-2,j-4) * -0.00178571428571
+                          +in(i-1,j-4) * -0.00178571428571
+                          +in(i,j-4) * -0.00178571428571
+                          +in(i+1,j-4) * -0.00178571428571
+                          +in(i+2,j-4) * -0.00178571428571
+                          +in(i+3,j-4) * -0.00178571428571
+                          +in(i+5,j-4) * 0.00111111111111
+                          +in(i-5,j-3) * -0.00111111111111
+                          +in(i-4,j-3) * -0.00178571428571
+                          +in(i-3,j-3) * -0.0166666666667
+                          +in(i-2,j-3) * -0.00333333333333
+                          +in(i-1,j-3) * -0.00333333333333
+                          +in(i,j-3) * -0.00333333333333
+                          +in(i+1,j-3) * -0.00333333333333
+                          +in(i+2,j-3) * -0.00333333333333
+                          +in(i+4,j-3) * 0.00178571428571
+                          +in(i+5,j-3) * 0.00111111111111
+                          +in(i-5,j-2) * -0.00111111111111
+                          +in(i-4,j-2) * -0.00178571428571
+                          +in(i-3,j-2) * -0.00333333333333
+                          +in(i-2,j-2) * -0.025
+                          +in(i-1,j-2) * -0.00833333333333
+                          +in(i,j-2) * -0.00833333333333
+                          +in(i+1,j-2) * -0.00833333333333
+                          +in(i+3,j-2) * 0.00333333333333
+                          +in(i+4,j-2) * 0.00178571428571
+                          +in(i+5,j-2) * 0.00111111111111
+                          +in(i-5,j-1) * -0.00111111111111
+                          +in(i-4,j-1) * -0.00178571428571
+                          +in(i-3,j-1) * -0.00333333333333
+                          +in(i-2,j-1) * -0.00833333333333
+                          +in(i-1,j-1) * -0.05
+                          +in(i,j-1) * -0.05
+                          +in(i+2,j-1) * 0.00833333333333
+                          +in(i+3,j-1) * 0.00333333333333
+                          +in(i+4,j-1) * 0.00178571428571
+                          +in(i+5,j-1) * 0.00111111111111
+                          +in(i-5,j) * -0.00111111111111
+                          +in(i-4,j) * -0.00178571428571
+                          +in(i-3,j) * -0.00333333333333
+                          +in(i-2,j) * -0.00833333333333
+                          +in(i-1,j) * -0.05
+                          +in(i+1,j) * 0.05
+                          +in(i+2,j) * 0.00833333333333
+                          +in(i+3,j) * 0.00333333333333
+                          +in(i+4,j) * 0.00178571428571
+                          +in(i+5,j) * 0.00111111111111
+                          +in(i-5,j+1) * -0.00111111111111
+                          +in(i-4,j+1) * -0.00178571428571
+                          +in(i-3,j+1) * -0.00333333333333
+                          +in(i-2,j+1) * -0.00833333333333
+                          +in(i,j+1) * 0.05
                           +in(i+1,j+1) * 0.05
-                          +in(i+1,j+2) * 0.00833333333333
-                          +in(i+1,j+3) * 0.00333333333333
-                          +in(i+1,j+4) * 0.00178571428571
-                          +in(i+1,j+5) * 0.00111111111111
-                          +in(i+2,j+-5) * -0.00111111111111
-                          +in(i+2,j+-4) * -0.00178571428571
-                          +in(i+2,j+-3) * -0.00333333333333
-                          +in(i+2,j+-1) * 0.00833333333333
-                          +in(i+2,j+0) * 0.00833333333333
                           +in(i+2,j+1) * 0.00833333333333
-                          +in(i+2,j+2) * 0.025
-                          +in(i+2,j+3) * 0.00333333333333
-                          +in(i+2,j+4) * 0.00178571428571
-                          +in(i+2,j+5) * 0.00111111111111
-                          +in(i+3,j+-5) * -0.00111111111111
-                          +in(i+3,j+-4) * -0.00178571428571
-                          +in(i+3,j+-2) * 0.00333333333333
-                          +in(i+3,j+-1) * 0.00333333333333
-                          +in(i+3,j+0) * 0.00333333333333
                           +in(i+3,j+1) * 0.00333333333333
-                          +in(i+3,j+2) * 0.00333333333333
-                          +in(i+3,j+3) * 0.0166666666667
-                          +in(i+3,j+4) * 0.00178571428571
-                          +in(i+3,j+5) * 0.00111111111111
-                          +in(i+4,j+-5) * -0.00111111111111
-                          +in(i+4,j+-3) * 0.00178571428571
-                          +in(i+4,j+-2) * 0.00178571428571
-                          +in(i+4,j+-1) * 0.00178571428571
-                          +in(i+4,j+0) * 0.00178571428571
                           +in(i+4,j+1) * 0.00178571428571
-                          +in(i+4,j+2) * 0.00178571428571
-                          +in(i+4,j+3) * 0.00178571428571
-                          +in(i+4,j+4) * 0.0125
-                          +in(i+4,j+5) * 0.00111111111111
-                          +in(i+5,j+-4) * 0.00111111111111
-                          +in(i+5,j+-3) * 0.00111111111111
-                          +in(i+5,j+-2) * 0.00111111111111
-                          +in(i+5,j+-1) * 0.00111111111111
-                          +in(i+5,j+0) * 0.00111111111111
                           +in(i+5,j+1) * 0.00111111111111
+                          +in(i-5,j+2) * -0.00111111111111
+                          +in(i-4,j+2) * -0.00178571428571
+                          +in(i-3,j+2) * -0.00333333333333
+                          +in(i-1,j+2) * 0.00833333333333
+                          +in(i,j+2) * 0.00833333333333
+                          +in(i+1,j+2) * 0.00833333333333
+                          +in(i+2,j+2) * 0.025
+                          +in(i+3,j+2) * 0.00333333333333
+                          +in(i+4,j+2) * 0.00178571428571
                           +in(i+5,j+2) * 0.00111111111111
+                          +in(i-5,j+3) * -0.00111111111111
+                          +in(i-4,j+3) * -0.00178571428571
+                          +in(i-2,j+3) * 0.00333333333333
+                          +in(i-1,j+3) * 0.00333333333333
+                          +in(i,j+3) * 0.00333333333333
+                          +in(i+1,j+3) * 0.00333333333333
+                          +in(i+2,j+3) * 0.00333333333333
+                          +in(i+3,j+3) * 0.0166666666667
+                          +in(i+4,j+3) * 0.00178571428571
                           +in(i+5,j+3) * 0.00111111111111
+                          +in(i-5,j+4) * -0.00111111111111
+                          +in(i-3,j+4) * 0.00178571428571
+                          +in(i-2,j+4) * 0.00178571428571
+                          +in(i-1,j+4) * 0.00178571428571
+                          +in(i,j+4) * 0.00178571428571
+                          +in(i+1,j+4) * 0.00178571428571
+                          +in(i+2,j+4) * 0.00178571428571
+                          +in(i+3,j+4) * 0.00178571428571
+                          +in(i+4,j+4) * 0.0125
                           +in(i+5,j+4) * 0.00111111111111
+                          +in(i-4,j+5) * 0.00111111111111
+                          +in(i-3,j+5) * 0.00111111111111
+                          +in(i-2,j+5) * 0.00111111111111
+                          +in(i-1,j+5) * 0.00111111111111
+                          +in(i,j+5) * 0.00111111111111
+                          +in(i+1,j+5) * 0.00111111111111
+                          +in(i+2,j+5) * 0.00111111111111
+                          +in(i+3,j+5) * 0.00111111111111
+                          +in(i+4,j+5) * 0.00111111111111
                           +in(i+5,j+5) * 0.01
                           ;
     });
diff --git a/Cxx11/stencil_openmp.hpp b/Cxx11/stencil_openmp.hpp
index 2a42b437c..009a32bc2 100644
--- a/Cxx11/stencil_openmp.hpp
+++ b/Cxx11/stencil_openmp.hpp
@@ -5,10 +5,10 @@ void star1(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
            }
          }
        }
@@ -22,14 +22,14 @@ void star2(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
            }
          }
        }
@@ -43,18 +43,18 @@ void star3(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
            }
          }
        }
@@ -68,22 +68,22 @@ void star4(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
            }
          }
        }
@@ -97,26 +97,26 @@ void star5(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
            }
          }
        }
@@ -130,11 +130,11 @@ void grid1(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
            }
@@ -150,25 +150,25 @@ void grid2(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -184,47 +184,47 @@ void grid3(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
@@ -240,77 +240,77 @@ void grid4(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -326,115 +326,115 @@ void grid5(const int n, const int t, const double * RESTRICT in, double * RESTRI
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_pgnu.hpp b/Cxx11/stencil_pgnu.hpp
index 0db4fedfc..39f2feb8e 100644
--- a/Cxx11/stencil_pgnu.hpp
+++ b/Cxx11/stencil_pgnu.hpp
@@ -2,10 +2,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(1,n-1);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
       });
     });
 }
@@ -14,14 +14,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(2,n-2);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
       });
     });
 }
@@ -30,18 +30,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(3,n-3);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
       });
     });
 }
@@ -50,22 +50,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(4,n-4);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
       });
     });
 }
@@ -74,26 +74,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(5,n-5);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
       });
     });
 }
@@ -102,11 +102,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(1,n-1);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
       });
@@ -117,25 +117,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(2,n-2);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
       });
@@ -146,47 +146,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(3,n-3);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
       });
@@ -197,77 +197,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(4,n-4);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
       });
@@ -278,115 +278,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(5,n-5);
     __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
       });
diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp
index e0557a6d3..f5a05a52d 100644
--- a/Cxx11/stencil_pstl.hpp
+++ b/Cxx11/stencil_pstl.hpp
@@ -2,10 +2,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(1,n-1);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
       });
     });
 }
@@ -14,14 +14,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(2,n-2);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
       });
     });
 }
@@ -30,18 +30,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(3,n-3);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
       });
     });
 }
@@ -50,22 +50,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(4,n-4);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
       });
     });
 }
@@ -74,26 +74,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(5,n-5);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
       });
     });
 }
@@ -102,11 +102,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(1,n-1);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
       });
@@ -117,25 +117,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(2,n-2);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
       });
@@ -146,47 +146,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(3,n-3);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
       });
@@ -197,77 +197,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(4,n-4);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
       });
@@ -278,115 +278,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(5,n-5);
     std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
       });
diff --git a/Cxx11/stencil_raja.hpp b/Cxx11/stencil_raja.hpp
index 82e4e6d8f..ebd2d28b1 100644
--- a/Cxx11/stencil_raja.hpp
+++ b/Cxx11/stencil_raja.hpp
@@ -1,10 +1,10 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
       });
     });
 }
@@ -12,14 +12,14 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
       });
     });
 }
@@ -27,18 +27,18 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
       });
     });
 }
@@ -46,22 +46,22 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
       });
     });
 }
@@ -69,26 +69,26 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
       });
     });
 }
@@ -96,11 +96,11 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
       });
@@ -110,25 +110,25 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
       });
@@ -138,47 +138,47 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
       });
@@ -188,77 +188,77 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
       });
@@ -268,115 +268,115 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
     RAJA::forall<thread_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) {
       RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
       });
diff --git a/Cxx11/stencil_rajaview.hpp b/Cxx11/stencil_rajaview.hpp
index 4a521770f..e6c9c6565 100644
--- a/Cxx11/stencil_rajaview.hpp
+++ b/Cxx11/stencil_rajaview.hpp
@@ -6,10 +6,10 @@ void star1(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(1,n-1);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-1,j+0) * -0.5
-                          +in(i+0,j+-1) * -0.5
-                          +in(i+0,j+1) * 0.5
-                          +in(i+1,j+0) * 0.5;
+              out(i,j) += +in(i,j-1) * -0.5
+                          +in(i-1,j) * -0.5
+                          +in(i+1,j) * 0.5
+                          +in(i,j+1) * 0.5;
     });
 }
 
@@ -17,14 +17,14 @@ void star2(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(2,n-2);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-2,j+0) * -0.125
-                          +in(i+-1,j+0) * -0.25
-                          +in(i+0,j+-2) * -0.125
-                          +in(i+0,j+-1) * -0.25
-                          +in(i+0,j+1) * 0.25
-                          +in(i+0,j+2) * 0.125
-                          +in(i+1,j+0) * 0.25
-                          +in(i+2,j+0) * 0.125;
+              out(i,j) += +in(i,j-2) * -0.125
+                          +in(i,j-1) * -0.25
+                          +in(i-2,j) * -0.125
+                          +in(i-1,j) * -0.25
+                          +in(i+1,j) * 0.25
+                          +in(i+2,j) * 0.125
+                          +in(i,j+1) * 0.25
+                          +in(i,j+2) * 0.125;
     });
 }
 
@@ -32,18 +32,18 @@ void star3(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(3,n-3);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-3,j+0) * -0.0555555555556
-                          +in(i+-2,j+0) * -0.0833333333333
-                          +in(i+-1,j+0) * -0.166666666667
-                          +in(i+0,j+-3) * -0.0555555555556
-                          +in(i+0,j+-2) * -0.0833333333333
-                          +in(i+0,j+-1) * -0.166666666667
-                          +in(i+0,j+1) * 0.166666666667
-                          +in(i+0,j+2) * 0.0833333333333
-                          +in(i+0,j+3) * 0.0555555555556
-                          +in(i+1,j+0) * 0.166666666667
-                          +in(i+2,j+0) * 0.0833333333333
-                          +in(i+3,j+0) * 0.0555555555556;
+              out(i,j) += +in(i,j-3) * -0.0555555555556
+                          +in(i,j-2) * -0.0833333333333
+                          +in(i,j-1) * -0.166666666667
+                          +in(i-3,j) * -0.0555555555556
+                          +in(i-2,j) * -0.0833333333333
+                          +in(i-1,j) * -0.166666666667
+                          +in(i+1,j) * 0.166666666667
+                          +in(i+2,j) * 0.0833333333333
+                          +in(i+3,j) * 0.0555555555556
+                          +in(i,j+1) * 0.166666666667
+                          +in(i,j+2) * 0.0833333333333
+                          +in(i,j+3) * 0.0555555555556;
     });
 }
 
@@ -51,22 +51,22 @@ void star4(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(4,n-4);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-4,j+0) * -0.03125
-                          +in(i+-3,j+0) * -0.0416666666667
-                          +in(i+-2,j+0) * -0.0625
-                          +in(i+-1,j+0) * -0.125
-                          +in(i+0,j+-4) * -0.03125
-                          +in(i+0,j+-3) * -0.0416666666667
-                          +in(i+0,j+-2) * -0.0625
-                          +in(i+0,j+-1) * -0.125
-                          +in(i+0,j+1) * 0.125
-                          +in(i+0,j+2) * 0.0625
-                          +in(i+0,j+3) * 0.0416666666667
-                          +in(i+0,j+4) * 0.03125
-                          +in(i+1,j+0) * 0.125
-                          +in(i+2,j+0) * 0.0625
-                          +in(i+3,j+0) * 0.0416666666667
-                          +in(i+4,j+0) * 0.03125;
+              out(i,j) += +in(i,j-4) * -0.03125
+                          +in(i,j-3) * -0.0416666666667
+                          +in(i,j-2) * -0.0625
+                          +in(i,j-1) * -0.125
+                          +in(i-4,j) * -0.03125
+                          +in(i-3,j) * -0.0416666666667
+                          +in(i-2,j) * -0.0625
+                          +in(i-1,j) * -0.125
+                          +in(i+1,j) * 0.125
+                          +in(i+2,j) * 0.0625
+                          +in(i+3,j) * 0.0416666666667
+                          +in(i+4,j) * 0.03125
+                          +in(i,j+1) * 0.125
+                          +in(i,j+2) * 0.0625
+                          +in(i,j+3) * 0.0416666666667
+                          +in(i,j+4) * 0.03125;
     });
 }
 
@@ -74,26 +74,26 @@ void star5(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(5,n-5);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-5,j+0) * -0.02
-                          +in(i+-4,j+0) * -0.025
-                          +in(i+-3,j+0) * -0.0333333333333
-                          +in(i+-2,j+0) * -0.05
-                          +in(i+-1,j+0) * -0.1
-                          +in(i+0,j+-5) * -0.02
-                          +in(i+0,j+-4) * -0.025
-                          +in(i+0,j+-3) * -0.0333333333333
-                          +in(i+0,j+-2) * -0.05
-                          +in(i+0,j+-1) * -0.1
-                          +in(i+0,j+1) * 0.1
-                          +in(i+0,j+2) * 0.05
-                          +in(i+0,j+3) * 0.0333333333333
-                          +in(i+0,j+4) * 0.025
-                          +in(i+0,j+5) * 0.02
-                          +in(i+1,j+0) * 0.1
-                          +in(i+2,j+0) * 0.05
-                          +in(i+3,j+0) * 0.0333333333333
-                          +in(i+4,j+0) * 0.025
-                          +in(i+5,j+0) * 0.02;
+              out(i,j) += +in(i,j-5) * -0.02
+                          +in(i,j-4) * -0.025
+                          +in(i,j-3) * -0.0333333333333
+                          +in(i,j-2) * -0.05
+                          +in(i,j-1) * -0.1
+                          +in(i-5,j) * -0.02
+                          +in(i-4,j) * -0.025
+                          +in(i-3,j) * -0.0333333333333
+                          +in(i-2,j) * -0.05
+                          +in(i-1,j) * -0.1
+                          +in(i+1,j) * 0.1
+                          +in(i+2,j) * 0.05
+                          +in(i+3,j) * 0.0333333333333
+                          +in(i+4,j) * 0.025
+                          +in(i+5,j) * 0.02
+                          +in(i,j+1) * 0.1
+                          +in(i,j+2) * 0.05
+                          +in(i,j+3) * 0.0333333333333
+                          +in(i,j+4) * 0.025
+                          +in(i,j+5) * 0.02;
     });
 }
 
@@ -101,11 +101,11 @@ void grid1(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(1,n-1);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-1,j+-1) * -0.25
-                          +in(i+-1,j+0) * -0.25
-                          +in(i+0,j+-1) * -0.25
-                          +in(i+0,j+1) * 0.25
-                          +in(i+1,j+0) * 0.25
+              out(i,j) += +in(i-1,j-1) * -0.25
+                          +in(i,j-1) * -0.25
+                          +in(i-1,j) * -0.25
+                          +in(i+1,j) * 0.25
+                          +in(i,j+1) * 0.25
                           +in(i+1,j+1) * 0.25
                           ;
     });
@@ -115,25 +115,25 @@ void grid2(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(2,n-2);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-2,j+-2) * -0.0625
-                          +in(i+-2,j+-1) * -0.0208333333333
-                          +in(i+-2,j+0) * -0.0208333333333
-                          +in(i+-2,j+1) * -0.0208333333333
-                          +in(i+-1,j+-2) * -0.0208333333333
-                          +in(i+-1,j+-1) * -0.125
-                          +in(i+-1,j+0) * -0.125
-                          +in(i+-1,j+2) * 0.0208333333333
-                          +in(i+0,j+-2) * -0.0208333333333
-                          +in(i+0,j+-1) * -0.125
-                          +in(i+0,j+1) * 0.125
-                          +in(i+0,j+2) * 0.0208333333333
-                          +in(i+1,j+-2) * -0.0208333333333
-                          +in(i+1,j+0) * 0.125
+              out(i,j) += +in(i-2,j-2) * -0.0625
+                          +in(i-1,j-2) * -0.0208333333333
+                          +in(i,j-2) * -0.0208333333333
+                          +in(i+1,j-2) * -0.0208333333333
+                          +in(i-2,j-1) * -0.0208333333333
+                          +in(i-1,j-1) * -0.125
+                          +in(i,j-1) * -0.125
+                          +in(i+2,j-1) * 0.0208333333333
+                          +in(i-2,j) * -0.0208333333333
+                          +in(i-1,j) * -0.125
+                          +in(i+1,j) * 0.125
+                          +in(i+2,j) * 0.0208333333333
+                          +in(i-2,j+1) * -0.0208333333333
+                          +in(i,j+1) * 0.125
                           +in(i+1,j+1) * 0.125
-                          +in(i+1,j+2) * 0.0208333333333
-                          +in(i+2,j+-1) * 0.0208333333333
-                          +in(i+2,j+0) * 0.0208333333333
                           +in(i+2,j+1) * 0.0208333333333
+                          +in(i-1,j+2) * 0.0208333333333
+                          +in(i,j+2) * 0.0208333333333
+                          +in(i+1,j+2) * 0.0208333333333
                           +in(i+2,j+2) * 0.0625
                           ;
     });
@@ -143,47 +143,47 @@ void grid3(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(3,n-3);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-3,j+-3) * -0.0277777777778
-                          +in(i+-3,j+-2) * -0.00555555555556
-                          +in(i+-3,j+-1) * -0.00555555555556
-                          +in(i+-3,j+0) * -0.00555555555556
-                          +in(i+-3,j+1) * -0.00555555555556
-                          +in(i+-3,j+2) * -0.00555555555556
-                          +in(i+-2,j+-3) * -0.00555555555556
-                          +in(i+-2,j+-2) * -0.0416666666667
-                          +in(i+-2,j+-1) * -0.0138888888889
-                          +in(i+-2,j+0) * -0.0138888888889
-                          +in(i+-2,j+1) * -0.0138888888889
-                          +in(i+-2,j+3) * 0.00555555555556
-                          +in(i+-1,j+-3) * -0.00555555555556
-                          +in(i+-1,j+-2) * -0.0138888888889
-                          +in(i+-1,j+-1) * -0.0833333333333
-                          +in(i+-1,j+0) * -0.0833333333333
-                          +in(i+-1,j+2) * 0.0138888888889
-                          +in(i+-1,j+3) * 0.00555555555556
-                          +in(i+0,j+-3) * -0.00555555555556
-                          +in(i+0,j+-2) * -0.0138888888889
-                          +in(i+0,j+-1) * -0.0833333333333
-                          +in(i+0,j+1) * 0.0833333333333
-                          +in(i+0,j+2) * 0.0138888888889
-                          +in(i+0,j+3) * 0.00555555555556
-                          +in(i+1,j+-3) * -0.00555555555556
-                          +in(i+1,j+-2) * -0.0138888888889
-                          +in(i+1,j+0) * 0.0833333333333
+              out(i,j) += +in(i-3,j-3) * -0.0277777777778
+                          +in(i-2,j-3) * -0.00555555555556
+                          +in(i-1,j-3) * -0.00555555555556
+                          +in(i,j-3) * -0.00555555555556
+                          +in(i+1,j-3) * -0.00555555555556
+                          +in(i+2,j-3) * -0.00555555555556
+                          +in(i-3,j-2) * -0.00555555555556
+                          +in(i-2,j-2) * -0.0416666666667
+                          +in(i-1,j-2) * -0.0138888888889
+                          +in(i,j-2) * -0.0138888888889
+                          +in(i+1,j-2) * -0.0138888888889
+                          +in(i+3,j-2) * 0.00555555555556
+                          +in(i-3,j-1) * -0.00555555555556
+                          +in(i-2,j-1) * -0.0138888888889
+                          +in(i-1,j-1) * -0.0833333333333
+                          +in(i,j-1) * -0.0833333333333
+                          +in(i+2,j-1) * 0.0138888888889
+                          +in(i+3,j-1) * 0.00555555555556
+                          +in(i-3,j) * -0.00555555555556
+                          +in(i-2,j) * -0.0138888888889
+                          +in(i-1,j) * -0.0833333333333
+                          +in(i+1,j) * 0.0833333333333
+                          +in(i+2,j) * 0.0138888888889
+                          +in(i+3,j) * 0.00555555555556
+                          +in(i-3,j+1) * -0.00555555555556
+                          +in(i-2,j+1) * -0.0138888888889
+                          +in(i,j+1) * 0.0833333333333
                           +in(i+1,j+1) * 0.0833333333333
-                          +in(i+1,j+2) * 0.0138888888889
-                          +in(i+1,j+3) * 0.00555555555556
-                          +in(i+2,j+-3) * -0.00555555555556
-                          +in(i+2,j+-1) * 0.0138888888889
-                          +in(i+2,j+0) * 0.0138888888889
                           +in(i+2,j+1) * 0.0138888888889
-                          +in(i+2,j+2) * 0.0416666666667
-                          +in(i+2,j+3) * 0.00555555555556
-                          +in(i+3,j+-2) * 0.00555555555556
-                          +in(i+3,j+-1) * 0.00555555555556
-                          +in(i+3,j+0) * 0.00555555555556
                           +in(i+3,j+1) * 0.00555555555556
+                          +in(i-3,j+2) * -0.00555555555556
+                          +in(i-1,j+2) * 0.0138888888889
+                          +in(i,j+2) * 0.0138888888889
+                          +in(i+1,j+2) * 0.0138888888889
+                          +in(i+2,j+2) * 0.0416666666667
                           +in(i+3,j+2) * 0.00555555555556
+                          +in(i-2,j+3) * 0.00555555555556
+                          +in(i-1,j+3) * 0.00555555555556
+                          +in(i,j+3) * 0.00555555555556
+                          +in(i+1,j+3) * 0.00555555555556
+                          +in(i+2,j+3) * 0.00555555555556
                           +in(i+3,j+3) * 0.0277777777778
                           ;
     });
@@ -193,77 +193,77 @@ void grid4(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(4,n-4);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-4,j+-4) * -0.015625
-                          +in(i+-4,j+-3) * -0.00223214285714
-                          +in(i+-4,j+-2) * -0.00223214285714
-                          +in(i+-4,j+-1) * -0.00223214285714
-                          +in(i+-4,j+0) * -0.00223214285714
-                          +in(i+-4,j+1) * -0.00223214285714
-                          +in(i+-4,j+2) * -0.00223214285714
-                          +in(i+-4,j+3) * -0.00223214285714
-                          +in(i+-3,j+-4) * -0.00223214285714
-                          +in(i+-3,j+-3) * -0.0208333333333
-                          +in(i+-3,j+-2) * -0.00416666666667
-                          +in(i+-3,j+-1) * -0.00416666666667
-                          +in(i+-3,j+0) * -0.00416666666667
-                          +in(i+-3,j+1) * -0.00416666666667
-                          +in(i+-3,j+2) * -0.00416666666667
-                          +in(i+-3,j+4) * 0.00223214285714
-                          +in(i+-2,j+-4) * -0.00223214285714
-                          +in(i+-2,j+-3) * -0.00416666666667
-                          +in(i+-2,j+-2) * -0.03125
-                          +in(i+-2,j+-1) * -0.0104166666667
-                          +in(i+-2,j+0) * -0.0104166666667
-                          +in(i+-2,j+1) * -0.0104166666667
-                          +in(i+-2,j+3) * 0.00416666666667
-                          +in(i+-2,j+4) * 0.00223214285714
-                          +in(i+-1,j+-4) * -0.00223214285714
-                          +in(i+-1,j+-3) * -0.00416666666667
-                          +in(i+-1,j+-2) * -0.0104166666667
-                          +in(i+-1,j+-1) * -0.0625
-                          +in(i+-1,j+0) * -0.0625
-                          +in(i+-1,j+2) * 0.0104166666667
-                          +in(i+-1,j+3) * 0.00416666666667
-                          +in(i+-1,j+4) * 0.00223214285714
-                          +in(i+0,j+-4) * -0.00223214285714
-                          +in(i+0,j+-3) * -0.00416666666667
-                          +in(i+0,j+-2) * -0.0104166666667
-                          +in(i+0,j+-1) * -0.0625
-                          +in(i+0,j+1) * 0.0625
-                          +in(i+0,j+2) * 0.0104166666667
-                          +in(i+0,j+3) * 0.00416666666667
-                          +in(i+0,j+4) * 0.00223214285714
-                          +in(i+1,j+-4) * -0.00223214285714
-                          +in(i+1,j+-3) * -0.00416666666667
-                          +in(i+1,j+-2) * -0.0104166666667
-                          +in(i+1,j+0) * 0.0625
+              out(i,j) += +in(i-4,j-4) * -0.015625
+                          +in(i-3,j-4) * -0.00223214285714
+                          +in(i-2,j-4) * -0.00223214285714
+                          +in(i-1,j-4) * -0.00223214285714
+                          +in(i,j-4) * -0.00223214285714
+                          +in(i+1,j-4) * -0.00223214285714
+                          +in(i+2,j-4) * -0.00223214285714
+                          +in(i+3,j-4) * -0.00223214285714
+                          +in(i-4,j-3) * -0.00223214285714
+                          +in(i-3,j-3) * -0.0208333333333
+                          +in(i-2,j-3) * -0.00416666666667
+                          +in(i-1,j-3) * -0.00416666666667
+                          +in(i,j-3) * -0.00416666666667
+                          +in(i+1,j-3) * -0.00416666666667
+                          +in(i+2,j-3) * -0.00416666666667
+                          +in(i+4,j-3) * 0.00223214285714
+                          +in(i-4,j-2) * -0.00223214285714
+                          +in(i-3,j-2) * -0.00416666666667
+                          +in(i-2,j-2) * -0.03125
+                          +in(i-1,j-2) * -0.0104166666667
+                          +in(i,j-2) * -0.0104166666667
+                          +in(i+1,j-2) * -0.0104166666667
+                          +in(i+3,j-2) * 0.00416666666667
+                          +in(i+4,j-2) * 0.00223214285714
+                          +in(i-4,j-1) * -0.00223214285714
+                          +in(i-3,j-1) * -0.00416666666667
+                          +in(i-2,j-1) * -0.0104166666667
+                          +in(i-1,j-1) * -0.0625
+                          +in(i,j-1) * -0.0625
+                          +in(i+2,j-1) * 0.0104166666667
+                          +in(i+3,j-1) * 0.00416666666667
+                          +in(i+4,j-1) * 0.00223214285714
+                          +in(i-4,j) * -0.00223214285714
+                          +in(i-3,j) * -0.00416666666667
+                          +in(i-2,j) * -0.0104166666667
+                          +in(i-1,j) * -0.0625
+                          +in(i+1,j) * 0.0625
+                          +in(i+2,j) * 0.0104166666667
+                          +in(i+3,j) * 0.00416666666667
+                          +in(i+4,j) * 0.00223214285714
+                          +in(i-4,j+1) * -0.00223214285714
+                          +in(i-3,j+1) * -0.00416666666667
+                          +in(i-2,j+1) * -0.0104166666667
+                          +in(i,j+1) * 0.0625
                           +in(i+1,j+1) * 0.0625
-                          +in(i+1,j+2) * 0.0104166666667
-                          +in(i+1,j+3) * 0.00416666666667
-                          +in(i+1,j+4) * 0.00223214285714
-                          +in(i+2,j+-4) * -0.00223214285714
-                          +in(i+2,j+-3) * -0.00416666666667
-                          +in(i+2,j+-1) * 0.0104166666667
-                          +in(i+2,j+0) * 0.0104166666667
                           +in(i+2,j+1) * 0.0104166666667
-                          +in(i+2,j+2) * 0.03125
-                          +in(i+2,j+3) * 0.00416666666667
-                          +in(i+2,j+4) * 0.00223214285714
-                          +in(i+3,j+-4) * -0.00223214285714
-                          +in(i+3,j+-2) * 0.00416666666667
-                          +in(i+3,j+-1) * 0.00416666666667
-                          +in(i+3,j+0) * 0.00416666666667
                           +in(i+3,j+1) * 0.00416666666667
-                          +in(i+3,j+2) * 0.00416666666667
-                          +in(i+3,j+3) * 0.0208333333333
-                          +in(i+3,j+4) * 0.00223214285714
-                          +in(i+4,j+-3) * 0.00223214285714
-                          +in(i+4,j+-2) * 0.00223214285714
-                          +in(i+4,j+-1) * 0.00223214285714
-                          +in(i+4,j+0) * 0.00223214285714
                           +in(i+4,j+1) * 0.00223214285714
+                          +in(i-4,j+2) * -0.00223214285714
+                          +in(i-3,j+2) * -0.00416666666667
+                          +in(i-1,j+2) * 0.0104166666667
+                          +in(i,j+2) * 0.0104166666667
+                          +in(i+1,j+2) * 0.0104166666667
+                          +in(i+2,j+2) * 0.03125
+                          +in(i+3,j+2) * 0.00416666666667
                           +in(i+4,j+2) * 0.00223214285714
+                          +in(i-4,j+3) * -0.00223214285714
+                          +in(i-2,j+3) * 0.00416666666667
+                          +in(i-1,j+3) * 0.00416666666667
+                          +in(i,j+3) * 0.00416666666667
+                          +in(i+1,j+3) * 0.00416666666667
+                          +in(i+2,j+3) * 0.00416666666667
+                          +in(i+3,j+3) * 0.0208333333333
                           +in(i+4,j+3) * 0.00223214285714
+                          +in(i-3,j+4) * 0.00223214285714
+                          +in(i-2,j+4) * 0.00223214285714
+                          +in(i-1,j+4) * 0.00223214285714
+                          +in(i,j+4) * 0.00223214285714
+                          +in(i+1,j+4) * 0.00223214285714
+                          +in(i+2,j+4) * 0.00223214285714
+                          +in(i+3,j+4) * 0.00223214285714
                           +in(i+4,j+4) * 0.015625
                           ;
     });
@@ -273,115 +273,115 @@ void grid5(const int n, const int t, matrix & in, matrix & out) {
     RAJA::RangeSegment inner1(5,n-5);
     auto inner2 = RAJA::make_tuple(inner1, inner1);
     RAJA::kernel<regular_policy>(inner2, [=](int i, int j) {
-              out(i,j) += +in(i+-5,j+-5) * -0.01
-                          +in(i+-5,j+-4) * -0.00111111111111
-                          +in(i+-5,j+-3) * -0.00111111111111
-                          +in(i+-5,j+-2) * -0.00111111111111
-                          +in(i+-5,j+-1) * -0.00111111111111
-                          +in(i+-5,j+0) * -0.00111111111111
-                          +in(i+-5,j+1) * -0.00111111111111
-                          +in(i+-5,j+2) * -0.00111111111111
-                          +in(i+-5,j+3) * -0.00111111111111
-                          +in(i+-5,j+4) * -0.00111111111111
-                          +in(i+-4,j+-5) * -0.00111111111111
-                          +in(i+-4,j+-4) * -0.0125
-                          +in(i+-4,j+-3) * -0.00178571428571
-                          +in(i+-4,j+-2) * -0.00178571428571
-                          +in(i+-4,j+-1) * -0.00178571428571
-                          +in(i+-4,j+0) * -0.00178571428571
-                          +in(i+-4,j+1) * -0.00178571428571
-                          +in(i+-4,j+2) * -0.00178571428571
-                          +in(i+-4,j+3) * -0.00178571428571
-                          +in(i+-4,j+5) * 0.00111111111111
-                          +in(i+-3,j+-5) * -0.00111111111111
-                          +in(i+-3,j+-4) * -0.00178571428571
-                          +in(i+-3,j+-3) * -0.0166666666667
-                          +in(i+-3,j+-2) * -0.00333333333333
-                          +in(i+-3,j+-1) * -0.00333333333333
-                          +in(i+-3,j+0) * -0.00333333333333
-                          +in(i+-3,j+1) * -0.00333333333333
-                          +in(i+-3,j+2) * -0.00333333333333
-                          +in(i+-3,j+4) * 0.00178571428571
-                          +in(i+-3,j+5) * 0.00111111111111
-                          +in(i+-2,j+-5) * -0.00111111111111
-                          +in(i+-2,j+-4) * -0.00178571428571
-                          +in(i+-2,j+-3) * -0.00333333333333
-                          +in(i+-2,j+-2) * -0.025
-                          +in(i+-2,j+-1) * -0.00833333333333
-                          +in(i+-2,j+0) * -0.00833333333333
-                          +in(i+-2,j+1) * -0.00833333333333
-                          +in(i+-2,j+3) * 0.00333333333333
-                          +in(i+-2,j+4) * 0.00178571428571
-                          +in(i+-2,j+5) * 0.00111111111111
-                          +in(i+-1,j+-5) * -0.00111111111111
-                          +in(i+-1,j+-4) * -0.00178571428571
-                          +in(i+-1,j+-3) * -0.00333333333333
-                          +in(i+-1,j+-2) * -0.00833333333333
-                          +in(i+-1,j+-1) * -0.05
-                          +in(i+-1,j+0) * -0.05
-                          +in(i+-1,j+2) * 0.00833333333333
-                          +in(i+-1,j+3) * 0.00333333333333
-                          +in(i+-1,j+4) * 0.00178571428571
-                          +in(i+-1,j+5) * 0.00111111111111
-                          +in(i+0,j+-5) * -0.00111111111111
-                          +in(i+0,j+-4) * -0.00178571428571
-                          +in(i+0,j+-3) * -0.00333333333333
-                          +in(i+0,j+-2) * -0.00833333333333
-                          +in(i+0,j+-1) * -0.05
-                          +in(i+0,j+1) * 0.05
-                          +in(i+0,j+2) * 0.00833333333333
-                          +in(i+0,j+3) * 0.00333333333333
-                          +in(i+0,j+4) * 0.00178571428571
-                          +in(i+0,j+5) * 0.00111111111111
-                          +in(i+1,j+-5) * -0.00111111111111
-                          +in(i+1,j+-4) * -0.00178571428571
-                          +in(i+1,j+-3) * -0.00333333333333
-                          +in(i+1,j+-2) * -0.00833333333333
-                          +in(i+1,j+0) * 0.05
+              out(i,j) += +in(i-5,j-5) * -0.01
+                          +in(i-4,j-5) * -0.00111111111111
+                          +in(i-3,j-5) * -0.00111111111111
+                          +in(i-2,j-5) * -0.00111111111111
+                          +in(i-1,j-5) * -0.00111111111111
+                          +in(i,j-5) * -0.00111111111111
+                          +in(i+1,j-5) * -0.00111111111111
+                          +in(i+2,j-5) * -0.00111111111111
+                          +in(i+3,j-5) * -0.00111111111111
+                          +in(i+4,j-5) * -0.00111111111111
+                          +in(i-5,j-4) * -0.00111111111111
+                          +in(i-4,j-4) * -0.0125
+                          +in(i-3,j-4) * -0.00178571428571
+                          +in(i-2,j-4) * -0.00178571428571
+                          +in(i-1,j-4) * -0.00178571428571
+                          +in(i,j-4) * -0.00178571428571
+                          +in(i+1,j-4) * -0.00178571428571
+                          +in(i+2,j-4) * -0.00178571428571
+                          +in(i+3,j-4) * -0.00178571428571
+                          +in(i+5,j-4) * 0.00111111111111
+                          +in(i-5,j-3) * -0.00111111111111
+                          +in(i-4,j-3) * -0.00178571428571
+                          +in(i-3,j-3) * -0.0166666666667
+                          +in(i-2,j-3) * -0.00333333333333
+                          +in(i-1,j-3) * -0.00333333333333
+                          +in(i,j-3) * -0.00333333333333
+                          +in(i+1,j-3) * -0.00333333333333
+                          +in(i+2,j-3) * -0.00333333333333
+                          +in(i+4,j-3) * 0.00178571428571
+                          +in(i+5,j-3) * 0.00111111111111
+                          +in(i-5,j-2) * -0.00111111111111
+                          +in(i-4,j-2) * -0.00178571428571
+                          +in(i-3,j-2) * -0.00333333333333
+                          +in(i-2,j-2) * -0.025
+                          +in(i-1,j-2) * -0.00833333333333
+                          +in(i,j-2) * -0.00833333333333
+                          +in(i+1,j-2) * -0.00833333333333
+                          +in(i+3,j-2) * 0.00333333333333
+                          +in(i+4,j-2) * 0.00178571428571
+                          +in(i+5,j-2) * 0.00111111111111
+                          +in(i-5,j-1) * -0.00111111111111
+                          +in(i-4,j-1) * -0.00178571428571
+                          +in(i-3,j-1) * -0.00333333333333
+                          +in(i-2,j-1) * -0.00833333333333
+                          +in(i-1,j-1) * -0.05
+                          +in(i,j-1) * -0.05
+                          +in(i+2,j-1) * 0.00833333333333
+                          +in(i+3,j-1) * 0.00333333333333
+                          +in(i+4,j-1) * 0.00178571428571
+                          +in(i+5,j-1) * 0.00111111111111
+                          +in(i-5,j) * -0.00111111111111
+                          +in(i-4,j) * -0.00178571428571
+                          +in(i-3,j) * -0.00333333333333
+                          +in(i-2,j) * -0.00833333333333
+                          +in(i-1,j) * -0.05
+                          +in(i+1,j) * 0.05
+                          +in(i+2,j) * 0.00833333333333
+                          +in(i+3,j) * 0.00333333333333
+                          +in(i+4,j) * 0.00178571428571
+                          +in(i+5,j) * 0.00111111111111
+                          +in(i-5,j+1) * -0.00111111111111
+                          +in(i-4,j+1) * -0.00178571428571
+                          +in(i-3,j+1) * -0.00333333333333
+                          +in(i-2,j+1) * -0.00833333333333
+                          +in(i,j+1) * 0.05
                           +in(i+1,j+1) * 0.05
-                          +in(i+1,j+2) * 0.00833333333333
-                          +in(i+1,j+3) * 0.00333333333333
-                          +in(i+1,j+4) * 0.00178571428571
-                          +in(i+1,j+5) * 0.00111111111111
-                          +in(i+2,j+-5) * -0.00111111111111
-                          +in(i+2,j+-4) * -0.00178571428571
-                          +in(i+2,j+-3) * -0.00333333333333
-                          +in(i+2,j+-1) * 0.00833333333333
-                          +in(i+2,j+0) * 0.00833333333333
                           +in(i+2,j+1) * 0.00833333333333
-                          +in(i+2,j+2) * 0.025
-                          +in(i+2,j+3) * 0.00333333333333
-                          +in(i+2,j+4) * 0.00178571428571
-                          +in(i+2,j+5) * 0.00111111111111
-                          +in(i+3,j+-5) * -0.00111111111111
-                          +in(i+3,j+-4) * -0.00178571428571
-                          +in(i+3,j+-2) * 0.00333333333333
-                          +in(i+3,j+-1) * 0.00333333333333
-                          +in(i+3,j+0) * 0.00333333333333
                           +in(i+3,j+1) * 0.00333333333333
-                          +in(i+3,j+2) * 0.00333333333333
-                          +in(i+3,j+3) * 0.0166666666667
-                          +in(i+3,j+4) * 0.00178571428571
-                          +in(i+3,j+5) * 0.00111111111111
-                          +in(i+4,j+-5) * -0.00111111111111
-                          +in(i+4,j+-3) * 0.00178571428571
-                          +in(i+4,j+-2) * 0.00178571428571
-                          +in(i+4,j+-1) * 0.00178571428571
-                          +in(i+4,j+0) * 0.00178571428571
                           +in(i+4,j+1) * 0.00178571428571
-                          +in(i+4,j+2) * 0.00178571428571
-                          +in(i+4,j+3) * 0.00178571428571
-                          +in(i+4,j+4) * 0.0125
-                          +in(i+4,j+5) * 0.00111111111111
-                          +in(i+5,j+-4) * 0.00111111111111
-                          +in(i+5,j+-3) * 0.00111111111111
-                          +in(i+5,j+-2) * 0.00111111111111
-                          +in(i+5,j+-1) * 0.00111111111111
-                          +in(i+5,j+0) * 0.00111111111111
                           +in(i+5,j+1) * 0.00111111111111
+                          +in(i-5,j+2) * -0.00111111111111
+                          +in(i-4,j+2) * -0.00178571428571
+                          +in(i-3,j+2) * -0.00333333333333
+                          +in(i-1,j+2) * 0.00833333333333
+                          +in(i,j+2) * 0.00833333333333
+                          +in(i+1,j+2) * 0.00833333333333
+                          +in(i+2,j+2) * 0.025
+                          +in(i+3,j+2) * 0.00333333333333
+                          +in(i+4,j+2) * 0.00178571428571
                           +in(i+5,j+2) * 0.00111111111111
+                          +in(i-5,j+3) * -0.00111111111111
+                          +in(i-4,j+3) * -0.00178571428571
+                          +in(i-2,j+3) * 0.00333333333333
+                          +in(i-1,j+3) * 0.00333333333333
+                          +in(i,j+3) * 0.00333333333333
+                          +in(i+1,j+3) * 0.00333333333333
+                          +in(i+2,j+3) * 0.00333333333333
+                          +in(i+3,j+3) * 0.0166666666667
+                          +in(i+4,j+3) * 0.00178571428571
                           +in(i+5,j+3) * 0.00111111111111
+                          +in(i-5,j+4) * -0.00111111111111
+                          +in(i-3,j+4) * 0.00178571428571
+                          +in(i-2,j+4) * 0.00178571428571
+                          +in(i-1,j+4) * 0.00178571428571
+                          +in(i,j+4) * 0.00178571428571
+                          +in(i+1,j+4) * 0.00178571428571
+                          +in(i+2,j+4) * 0.00178571428571
+                          +in(i+3,j+4) * 0.00178571428571
+                          +in(i+4,j+4) * 0.0125
                           +in(i+5,j+4) * 0.00111111111111
+                          +in(i-4,j+5) * 0.00111111111111
+                          +in(i-3,j+5) * 0.00111111111111
+                          +in(i-2,j+5) * 0.00111111111111
+                          +in(i-1,j+5) * 0.00111111111111
+                          +in(i,j+5) * 0.00111111111111
+                          +in(i+1,j+5) * 0.00111111111111
+                          +in(i+2,j+5) * 0.00111111111111
+                          +in(i+3,j+5) * 0.00111111111111
+                          +in(i+4,j+5) * 0.00111111111111
                           +in(i+5,j+5) * 0.01
                           ;
     });
diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp
index c85964181..f1ecb729e 100644
--- a/Cxx11/stencil_rangefor.hpp
+++ b/Cxx11/stencil_rangefor.hpp
@@ -3,10 +3,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
            }
          }
        }
@@ -18,14 +18,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
            }
          }
        }
@@ -37,18 +37,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
            }
          }
        }
@@ -60,22 +60,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
            }
          }
        }
@@ -87,26 +87,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
            }
          }
        }
@@ -118,11 +118,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
            }
@@ -136,25 +136,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -168,47 +168,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
@@ -222,77 +222,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -306,115 +306,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i : inside) {
       PRAGMA_SIMD
       for (auto j : inside) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_seq.hpp b/Cxx11/stencil_seq.hpp
index 139082a3b..4ed03972e 100644
--- a/Cxx11/stencil_seq.hpp
+++ b/Cxx11/stencil_seq.hpp
@@ -4,10 +4,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
            }
          }
        }
@@ -20,14 +20,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
            }
          }
        }
@@ -40,18 +40,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
            }
          }
        }
@@ -64,22 +64,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
            }
          }
        }
@@ -92,26 +92,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
            }
          }
        }
@@ -124,11 +124,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
            }
@@ -143,25 +143,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -176,47 +176,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
@@ -231,77 +231,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -316,115 +316,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
           PRAGMA_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_stl.hpp b/Cxx11/stencil_stl.hpp
index 6633cff00..ac0292652 100644
--- a/Cxx11/stencil_stl.hpp
+++ b/Cxx11/stencil_stl.hpp
@@ -2,10 +2,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(1,n-1);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
       });
     });
 }
@@ -14,14 +14,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(2,n-2);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
       });
     });
 }
@@ -30,18 +30,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(3,n-3);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
       });
     });
 }
@@ -50,22 +50,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(4,n-4);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
       });
     });
 }
@@ -74,26 +74,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(5,n-5);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
       });
     });
 }
@@ -102,11 +102,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(1,n-1);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
       });
@@ -117,25 +117,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(2,n-2);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
       });
@@ -146,47 +146,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(3,n-3);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
       });
@@ -197,77 +197,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(4,n-4);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
       });
@@ -278,115 +278,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     auto inside = prk::range(5,n-5);
     std::for_each( std::begin(inside), std::end(inside), [&] (int i) {
       std::for_each( std::begin(inside), std::end(inside), [&] (int j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
       });
diff --git a/Cxx11/stencil_target.hpp b/Cxx11/stencil_target.hpp
index 1e36ef2be..cb1ee14f8 100644
--- a/Cxx11/stencil_target.hpp
+++ b/Cxx11/stencil_target.hpp
@@ -4,10 +4,10 @@ void star1(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=1; i<n-1; ++i) {
       for (auto j=1; j<n-1; ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
        }
      }
 }
@@ -16,14 +16,14 @@ void star2(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=2; i<n-2; ++i) {
       for (auto j=2; j<n-2; ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
        }
      }
 }
@@ -32,18 +32,18 @@ void star3(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=3; i<n-3; ++i) {
       for (auto j=3; j<n-3; ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
        }
      }
 }
@@ -52,22 +52,22 @@ void star4(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=4; i<n-4; ++i) {
       for (auto j=4; j<n-4; ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
        }
      }
 }
@@ -76,26 +76,26 @@ void star5(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=5; i<n-5; ++i) {
       for (auto j=5; j<n-5; ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
        }
      }
 }
@@ -104,11 +104,11 @@ void grid1(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=1; i<n-1; ++i) {
       for (auto j=1; j<n-1; ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
        }
@@ -119,25 +119,25 @@ void grid2(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=2; i<n-2; ++i) {
       for (auto j=2; j<n-2; ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
        }
@@ -148,47 +148,47 @@ void grid3(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=3; i<n-3; ++i) {
       for (auto j=3; j<n-3; ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
        }
@@ -199,77 +199,77 @@ void grid4(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=4; i<n-4; ++i) {
       for (auto j=4; j<n-4; ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
        }
@@ -280,115 +280,115 @@ void grid5(const int n, const int t, const double * RESTRICT in, double * RESTRI
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (auto i=5; i<n-5; ++i) {
       for (auto j=5; j<n-5; ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
        }
diff --git a/Cxx11/stencil_taskloop.hpp b/Cxx11/stencil_taskloop.hpp
index fec723685..856f41995 100644
--- a/Cxx11/stencil_taskloop.hpp
+++ b/Cxx11/stencil_taskloop.hpp
@@ -5,10 +5,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
            }
          }
        }
@@ -22,14 +22,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
            }
          }
        }
@@ -43,18 +43,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
            }
          }
        }
@@ -68,22 +68,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
            }
          }
        }
@@ -97,26 +97,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
            }
          }
        }
@@ -130,11 +130,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
            }
@@ -150,25 +150,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
            }
@@ -184,47 +184,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
            }
@@ -240,77 +240,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
            }
@@ -326,115 +326,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
           OMP_SIMD
           for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
            }
diff --git a/Cxx11/stencil_tbb.hpp b/Cxx11/stencil_tbb.hpp
index 9dc8b4d16..7b68173a9 100644
--- a/Cxx11/stencil_tbb.hpp
+++ b/Cxx11/stencil_tbb.hpp
@@ -4,10 +4,10 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5
-                          +in[(i+0)*n+(j+-1)] * -0.5
-                          +in[(i+0)*n+(j+1)] * 0.5
-                          +in[(i+1)*n+(j+0)] * 0.5;
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
       }
     }
   }, tbb_partitioner );
@@ -19,14 +19,14 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-2)] * -0.125
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+0)*n+(j+2)] * 0.125
-                          +in[(i+1)*n+(j+0)] * 0.25
-                          +in[(i+2)*n+(j+0)] * 0.125;
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
       }
     }
   }, tbb_partitioner );
@@ -38,18 +38,18 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556
-                          +in[(i+-2)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.166666666667
-                          +in[(i+0)*n+(j+-3)] * -0.0555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.166666666667
-                          +in[(i+0)*n+(j+1)] * 0.166666666667
-                          +in[(i+0)*n+(j+2)] * 0.0833333333333
-                          +in[(i+0)*n+(j+3)] * 0.0555555555556
-                          +in[(i+1)*n+(j+0)] * 0.166666666667
-                          +in[(i+2)*n+(j+0)] * 0.0833333333333
-                          +in[(i+3)*n+(j+0)] * 0.0555555555556;
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
       }
     }
   }, tbb_partitioner );
@@ -61,22 +61,22 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125
-                          +in[(i+-3)*n+(j+0)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+0)*n+(j+-4)] * -0.03125
-                          +in[(i+0)*n+(j+-3)] * -0.0416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0625
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0625
-                          +in[(i+0)*n+(j+3)] * 0.0416666666667
-                          +in[(i+0)*n+(j+4)] * 0.03125
-                          +in[(i+1)*n+(j+0)] * 0.125
-                          +in[(i+2)*n+(j+0)] * 0.0625
-                          +in[(i+3)*n+(j+0)] * 0.0416666666667
-                          +in[(i+4)*n+(j+0)] * 0.03125;
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
       }
     }
   }, tbb_partitioner );
@@ -88,26 +88,26 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02
-                          +in[(i+-4)*n+(j+0)] * -0.025
-                          +in[(i+-3)*n+(j+0)] * -0.0333333333333
-                          +in[(i+-2)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.1
-                          +in[(i+0)*n+(j+-5)] * -0.02
-                          +in[(i+0)*n+(j+-4)] * -0.025
-                          +in[(i+0)*n+(j+-3)] * -0.0333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.05
-                          +in[(i+0)*n+(j+-1)] * -0.1
-                          +in[(i+0)*n+(j+1)] * 0.1
-                          +in[(i+0)*n+(j+2)] * 0.05
-                          +in[(i+0)*n+(j+3)] * 0.0333333333333
-                          +in[(i+0)*n+(j+4)] * 0.025
-                          +in[(i+0)*n+(j+5)] * 0.02
-                          +in[(i+1)*n+(j+0)] * 0.1
-                          +in[(i+2)*n+(j+0)] * 0.05
-                          +in[(i+3)*n+(j+0)] * 0.0333333333333
-                          +in[(i+4)*n+(j+0)] * 0.025
-                          +in[(i+5)*n+(j+0)] * 0.02;
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
       }
     }
   }, tbb_partitioner );
@@ -119,11 +119,11 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25
-                          +in[(i+-1)*n+(j+0)] * -0.25
-                          +in[(i+0)*n+(j+-1)] * -0.25
-                          +in[(i+0)*n+(j+1)] * 0.25
-                          +in[(i+1)*n+(j+0)] * 0.25
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
                           +in[(i+1)*n+(j+1)] * 0.25
                           ;
       }
@@ -137,25 +137,25 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625
-                          +in[(i+-2)*n+(j+-1)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+0)] * -0.0208333333333
-                          +in[(i+-2)*n+(j+1)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.125
-                          +in[(i+-1)*n+(j+0)] * -0.125
-                          +in[(i+-1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+0)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+0)*n+(j+-1)] * -0.125
-                          +in[(i+0)*n+(j+1)] * 0.125
-                          +in[(i+0)*n+(j+2)] * 0.0208333333333
-                          +in[(i+1)*n+(j+-2)] * -0.0208333333333
-                          +in[(i+1)*n+(j+0)] * 0.125
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
                           +in[(i+1)*n+(j+1)] * 0.125
-                          +in[(i+1)*n+(j+2)] * 0.0208333333333
-                          +in[(i+2)*n+(j+-1)] * 0.0208333333333
-                          +in[(i+2)*n+(j+0)] * 0.0208333333333
                           +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
                           +in[(i+2)*n+(j+2)] * 0.0625
                           ;
       }
@@ -169,47 +169,47 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778
-                          +in[(i+-3)*n+(j+-2)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+-1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+0)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+1)] * -0.00555555555556
-                          +in[(i+-3)*n+(j+2)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-2)*n+(j+-2)] * -0.0416666666667
-                          +in[(i+-2)*n+(j+-1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+0)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+1)] * -0.0138888888889
-                          +in[(i+-2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+-1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+-1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+-1)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+0)] * -0.0833333333333
-                          +in[(i+-1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+-1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+0)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+0)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+0)*n+(j+-1)] * -0.0833333333333
-                          +in[(i+0)*n+(j+1)] * 0.0833333333333
-                          +in[(i+0)*n+(j+2)] * 0.0138888888889
-                          +in[(i+0)*n+(j+3)] * 0.00555555555556
-                          +in[(i+1)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+1)*n+(j+-2)] * -0.0138888888889
-                          +in[(i+1)*n+(j+0)] * 0.0833333333333
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
                           +in[(i+1)*n+(j+1)] * 0.0833333333333
-                          +in[(i+1)*n+(j+2)] * 0.0138888888889
-                          +in[(i+1)*n+(j+3)] * 0.00555555555556
-                          +in[(i+2)*n+(j+-3)] * -0.00555555555556
-                          +in[(i+2)*n+(j+-1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+0)] * 0.0138888888889
                           +in[(i+2)*n+(j+1)] * 0.0138888888889
-                          +in[(i+2)*n+(j+2)] * 0.0416666666667
-                          +in[(i+2)*n+(j+3)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-2)] * 0.00555555555556
-                          +in[(i+3)*n+(j+-1)] * 0.00555555555556
-                          +in[(i+3)*n+(j+0)] * 0.00555555555556
                           +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
                           +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
                           +in[(i+3)*n+(j+3)] * 0.0277777777778
                           ;
       }
@@ -223,77 +223,77 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625
-                          +in[(i+-4)*n+(j+-3)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+-1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+0)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+1)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+2)] * -0.00223214285714
-                          +in[(i+-4)*n+(j+3)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-3)*n+(j+-3)] * -0.0208333333333
-                          +in[(i+-3)*n+(j+-2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+-1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+0)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+1)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+2)] * -0.00416666666667
-                          +in[(i+-3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-2)*n+(j+-2)] * -0.03125
-                          +in[(i+-2)*n+(j+-1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+0)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+1)] * -0.0104166666667
-                          +in[(i+-2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+-1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+-1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+-1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+-1)*n+(j+-1)] * -0.0625
-                          +in[(i+-1)*n+(j+0)] * -0.0625
-                          +in[(i+-1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+-1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+-1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+0)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+0)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+0)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+0)*n+(j+-1)] * -0.0625
-                          +in[(i+0)*n+(j+1)] * 0.0625
-                          +in[(i+0)*n+(j+2)] * 0.0104166666667
-                          +in[(i+0)*n+(j+3)] * 0.00416666666667
-                          +in[(i+0)*n+(j+4)] * 0.00223214285714
-                          +in[(i+1)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+1)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+1)*n+(j+-2)] * -0.0104166666667
-                          +in[(i+1)*n+(j+0)] * 0.0625
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
                           +in[(i+1)*n+(j+1)] * 0.0625
-                          +in[(i+1)*n+(j+2)] * 0.0104166666667
-                          +in[(i+1)*n+(j+3)] * 0.00416666666667
-                          +in[(i+1)*n+(j+4)] * 0.00223214285714
-                          +in[(i+2)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+2)*n+(j+-3)] * -0.00416666666667
-                          +in[(i+2)*n+(j+-1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+0)] * 0.0104166666667
                           +in[(i+2)*n+(j+1)] * 0.0104166666667
-                          +in[(i+2)*n+(j+2)] * 0.03125
-                          +in[(i+2)*n+(j+3)] * 0.00416666666667
-                          +in[(i+2)*n+(j+4)] * 0.00223214285714
-                          +in[(i+3)*n+(j+-4)] * -0.00223214285714
-                          +in[(i+3)*n+(j+-2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+-1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+0)] * 0.00416666666667
                           +in[(i+3)*n+(j+1)] * 0.00416666666667
-                          +in[(i+3)*n+(j+2)] * 0.00416666666667
-                          +in[(i+3)*n+(j+3)] * 0.0208333333333
-                          +in[(i+3)*n+(j+4)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-3)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-2)] * 0.00223214285714
-                          +in[(i+4)*n+(j+-1)] * 0.00223214285714
-                          +in[(i+4)*n+(j+0)] * 0.00223214285714
                           +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
                           +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
                           +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
                           +in[(i+4)*n+(j+4)] * 0.015625
                           ;
       }
@@ -307,115 +307,115 @@ void grid5(const int n, const int t, std::vector<double> & in, std::vector<doubl
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
       PRAGMA_SIMD
       for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {
-            out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01
-                          +in[(i+-5)*n+(j+-4)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+-1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+0)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+1)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+2)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+3)] * -0.00111111111111
-                          +in[(i+-5)*n+(j+4)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-4)*n+(j+-4)] * -0.0125
-                          +in[(i+-4)*n+(j+-3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+-1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+0)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+1)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+2)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+3)] * -0.00178571428571
-                          +in[(i+-4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-3)*n+(j+-3)] * -0.0166666666667
-                          +in[(i+-3)*n+(j+-2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+-1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+0)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+1)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+2)] * -0.00333333333333
-                          +in[(i+-3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-2)*n+(j+-2)] * -0.025
-                          +in[(i+-2)*n+(j+-1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+0)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+1)] * -0.00833333333333
-                          +in[(i+-2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+-1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+-1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+-1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+-1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+-1)*n+(j+-1)] * -0.05
-                          +in[(i+-1)*n+(j+0)] * -0.05
-                          +in[(i+-1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+-1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+-1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+-1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+0)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+0)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+0)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+0)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+0)*n+(j+-1)] * -0.05
-                          +in[(i+0)*n+(j+1)] * 0.05
-                          +in[(i+0)*n+(j+2)] * 0.00833333333333
-                          +in[(i+0)*n+(j+3)] * 0.00333333333333
-                          +in[(i+0)*n+(j+4)] * 0.00178571428571
-                          +in[(i+0)*n+(j+5)] * 0.00111111111111
-                          +in[(i+1)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+1)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+1)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+1)*n+(j+-2)] * -0.00833333333333
-                          +in[(i+1)*n+(j+0)] * 0.05
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
                           +in[(i+1)*n+(j+1)] * 0.05
-                          +in[(i+1)*n+(j+2)] * 0.00833333333333
-                          +in[(i+1)*n+(j+3)] * 0.00333333333333
-                          +in[(i+1)*n+(j+4)] * 0.00178571428571
-                          +in[(i+1)*n+(j+5)] * 0.00111111111111
-                          +in[(i+2)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+2)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+2)*n+(j+-3)] * -0.00333333333333
-                          +in[(i+2)*n+(j+-1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+0)] * 0.00833333333333
                           +in[(i+2)*n+(j+1)] * 0.00833333333333
-                          +in[(i+2)*n+(j+2)] * 0.025
-                          +in[(i+2)*n+(j+3)] * 0.00333333333333
-                          +in[(i+2)*n+(j+4)] * 0.00178571428571
-                          +in[(i+2)*n+(j+5)] * 0.00111111111111
-                          +in[(i+3)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+3)*n+(j+-4)] * -0.00178571428571
-                          +in[(i+3)*n+(j+-2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+-1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+0)] * 0.00333333333333
                           +in[(i+3)*n+(j+1)] * 0.00333333333333
-                          +in[(i+3)*n+(j+2)] * 0.00333333333333
-                          +in[(i+3)*n+(j+3)] * 0.0166666666667
-                          +in[(i+3)*n+(j+4)] * 0.00178571428571
-                          +in[(i+3)*n+(j+5)] * 0.00111111111111
-                          +in[(i+4)*n+(j+-5)] * -0.00111111111111
-                          +in[(i+4)*n+(j+-3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+-1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+0)] * 0.00178571428571
                           +in[(i+4)*n+(j+1)] * 0.00178571428571
-                          +in[(i+4)*n+(j+2)] * 0.00178571428571
-                          +in[(i+4)*n+(j+3)] * 0.00178571428571
-                          +in[(i+4)*n+(j+4)] * 0.0125
-                          +in[(i+4)*n+(j+5)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-4)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-3)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-2)] * 0.00111111111111
-                          +in[(i+5)*n+(j+-1)] * 0.00111111111111
-                          +in[(i+5)*n+(j+0)] * 0.00111111111111
                           +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
                           +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
                           +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
                           +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
                           +in[(i+5)*n+(j+5)] * 0.01
                           ;
       }
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index e4b25c3cb..ab7363dee 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -647,9 +647,9 @@ case "$PRK_TARGET" in
             SYCLDIR=${TRAVIS_ROOT}/triSYCL
             if [ "${CC}" = "clang" ] ; then
                 # SYCL will compile without OpenMP
-                echo "SYCLCXX=${PRK_CXX} -pthread -std=c++14" >> common/make.defs
+                echo "SYCLCXX=${PRK_CXX} -pthread -std=c++17" >> common/make.defs
             else
-                echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++14" >> common/make.defs
+                echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++17" >> common/make.defs
             fi
             echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs
             make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl

From 1144a490a3bcde89e5d9066343e5e1acf8bba9fd Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 15 Sep 2018 12:32:06 -0700
Subject: [PATCH 114/245] Fix Travis issues (#365)

* fix Julia syntax issue

"1./" is a syntax error now.  change to "1.0/"

* fix issue with array += scalar
---
 JULIA/stencil.jl | 52 ++++++++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/JULIA/stencil.jl b/JULIA/stencil.jl
index 1b5e18bbb..3b53cf7d5 100644
--- a/JULIA/stencil.jl
+++ b/JULIA/stencil.jl
@@ -61,6 +61,21 @@
 #
 # *******************************************************************
 
+function do_add(A, n)
+    for i=1:n
+        for j=1:n
+            A[i,j] += 1.0
+        end
+    end
+end
+
+function do_init(A, n)
+    for i=1:n
+        for j=1:n
+            A[i,j] = i+j-2
+        end
+    end
+end
 
 function do_star(A, W, B, r, n)
     for j=r:n-r-1
@@ -151,38 +166,37 @@ function main()
     if pattern == "star"
         stencil_size = 4*r+1
         for i=1:r
-            W[r+1,r+i+1] = +1./(2*i*r)
-            W[r+i+1,r+1] = +1./(2*i*r)
-            W[r+1,r-i+1] = -1./(2*i*r)
-            W[r-i+1,r+1] = -1./(2*i*r)
+            W[r+1,r+i+1] =  1.0/(2*i*r)
+            W[r+i+1,r+1] =  1.0/(2*i*r)
+            W[r+1,r-i+1] = -1.0/(2*i*r)
+            W[r-i+1,r+1] = -1.0/(2*i*r)
         end
     else
         stencil_size = (2*r+1)^2
         for j=1:r
             for i=-j+1:j-1
-                W[r+i+1,r+j+1] = +1./(4*j*(2*j-1)*r)
-                W[r+i+1,r-j+1] = -1./(4*j*(2*j-1)*r)
-                W[r+j+1,r+i+1] = +1./(4*j*(2*j-1)*r)
-                W[r-j+1,r+i+1] = -1./(4*j*(2*j-1)*r)
+                W[r+i+1,r+j+1] =  1.0/(4*j*(2*j-1)*r)
+                W[r+i+1,r-j+1] = -1.0/(4*j*(2*j-1)*r)
+                W[r+j+1,r+i+1] =  1.0/(4*j*(2*j-1)*r)
+                W[r-j+1,r+i+1] = -1.0/(4*j*(2*j-1)*r)
             end
-            W[r+j+1,r+j+1]    = +1./(4*j*r)
-            W[r-j+1,r-j+1]    = -1./(4*j*r)
+            W[r+j+1,r+j+1]    =  1.0/(4*j*r)
+            W[r-j+1,r-j+1]    = -1.0/(4*j*r)
         end
     end
 
-    A = zeros(Float64,n,n)
-    for i=1:n
-        for j=1:n
-            A[i,j] = i+j-2
-        end
-    end
-    B = zeros(Float64,n,n)
-
+    precompile(do_init, (Array{Float64,2}, Int64))
     if pattern == "star"
         precompile(do_star, (Array{Float64,2}, Array{Float64,2}, Array{Float64,2}, Int64, Int64))
     else
         precompile(do_stencil, (Array{Float64,2}, Array{Float64,2}, Array{Float64,2}, Int64, Int64))
     end
+    precompile(do_add, (Array{Float64,2}, Int64))
+
+    A = zeros(Float64,n,n)
+    B = zeros(Float64,n,n)
+
+    do_init(A, n)
 
     t0 = time_ns()
 
@@ -192,7 +206,7 @@ function main()
         else
             do_stencil(A, W, B, r, n)
         end
-        A += 1.0
+        do_add(A, n)
     end
 
     t1 = time_ns()

From fba9268665d58012cda21a4205a801dc08552925 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 11 Jan 2019 17:25:44 -0800
Subject: [PATCH 115/245] update CMake (#366)

* and they say autotools is brittle...

* update SOS version

* disable SHMEM on Mac altogether

* disable RAJA

* disable RAJA

* ASLR breaks SOS
---
 .travis.yml                        | 4 ++++
 travis/build-run-prk.sh            | 2 ++
 travis/install-cmake.sh            | 7 ++-----
 travis/install-deps.sh             | 5 +++--
 travis/install-sandia-openshmem.sh | 4 ++--
 5 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ac0a9f07c..140751724 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -135,6 +135,10 @@ matrix:
   # Mac issue with thread_t (see https://github.com/humairakamal/fgmpi/pull/1)
   - os: osx
     env: PRK_TARGET=allfgmpi
+  # SOS@OFI has not worked on MacOS in a while :-(
+  - os: osx
+    compiler: clang
+    env: PRK_TARGET=allshmem
   allow_failures:
   # Travis trusty breaks this
   - os: linux
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index ab7363dee..de135e01f 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -591,6 +591,7 @@ case "$PRK_TARGET" in
                 ;;
         esac
         # RAJA
+        if [ 0 = 1 ] ; then
         make -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \
                                  p2p-raja stencil-raja transpose-raja nstream-raja
         # New (Views)
@@ -620,6 +621,7 @@ case "$PRK_TARGET" in
                 $PRK_TARGET_PATH/stencil-raja        10 200 20 $s $r
             done
         done
+        fi
         # Kokkos
         make -C $PRK_TARGET_PATH stencil-kokkos transpose-kokkos nstream-kokkos
         $PRK_TARGET_PATH/stencil-kokkos     10 1000
diff --git a/travis/install-cmake.sh b/travis/install-cmake.sh
index 0aa5f8daf..f807d8093 100755
--- a/travis/install-cmake.sh
+++ b/travis/install-cmake.sh
@@ -18,9 +18,6 @@ case "$os" in
         echo "Linux"
         if [ ! -d "$TRAVIS_ROOT/cmake" ]; then
             mkdir -p $TRAVIS_ROOT/cmake
-            # DEBUG
-            ls -l $TRAVIS_ROOT
-            ls -l $TRAVIS_ROOT/cmake
             # from source
             #wget --no-check-certificate -q https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz
             #tar -C $TRAVIS_ROOT -xzf cmake-3.4.1.tar.gz
@@ -30,8 +27,8 @@ case "$os" in
             #make -j4 && make install
             # from binary
             cd $TRAVIS_ROOT
-            wget --no-check-certificate -q https://cmake.org/files/v3.4/cmake-3.4.1-Linux-x86_64.sh
-            sh ./cmake-3.4.1-Linux-x86_64.sh --prefix=$TRAVIS_ROOT/cmake --skip-license --exclude-subdir
+            wget --no-check-certificate -q https://github.com/Kitware/CMake/releases/download/v3.13.2/cmake-3.13.2-Linux-x86_64.sh
+            sh ./cmake-3.13.2-Linux-x86_64.sh --prefix=$TRAVIS_ROOT/cmake --skip-license --exclude-subdir
         else
             echo "CMake installed..."
             find $TRAVIS_ROOT/cmake -name cmake
diff --git a/travis/install-deps.sh b/travis/install-deps.sh
index 89243cd93..cb79e96e9 100755
--- a/travis/install-deps.sh
+++ b/travis/install-deps.sh
@@ -66,9 +66,9 @@ case "$PRK_TARGET" in
         if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
             sh ./travis/install-boost.sh $TRAVIS_ROOT
         fi
-        # CMake 3.3 or higher is required.  You are running version 2.8.7.
+        # CMake 3.10 or higher is required.
         sh ./travis/install-cmake.sh $TRAVIS_ROOT
-        sh ./travis/install-raja.sh $TRAVIS_ROOT
+        #sh ./travis/install-raja.sh $TRAVIS_ROOT
         sh ./travis/install-kokkos.sh $TRAVIS_ROOT
         #sh ./travis/install-occa.sh $TRAVIS_ROOT
         sh ./travis/install-sycl.sh $TRAVIS_ROOT
@@ -94,6 +94,7 @@ case "$PRK_TARGET" in
             brew link --overwrite gcc || true
         fi
         if [ "${CC}" = "gcc" ] ; then
+            sh ./travis/install-cmake.sh $TRAVIS_ROOT
             sh ./travis/install-opencoarrays.sh $TRAVIS_ROOT
         fi
         ;;
diff --git a/travis/install-sandia-openshmem.sh b/travis/install-sandia-openshmem.sh
index 308c32d3d..0d046d7ef 100755
--- a/travis/install-sandia-openshmem.sh
+++ b/travis/install-sandia-openshmem.sh
@@ -16,7 +16,7 @@ if [ ! -d "$SHMEM_ROOT" ]; then
     # HEAD
     #git clone --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git sandia-shmem
     #cd sandia-shmem
-    VERSION=1.4.0
+    VERSION=1.4.2
     #git clone -b v$VERSION --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git SOS-$VERSION
     wget https://github.com/Sandia-OpenSHMEM/SOS/archive/v$VERSION.tar.gz
     tar -xzf v$VERSION.tar.gz
@@ -28,9 +28,9 @@ if [ ! -d "$SHMEM_ROOT" ]; then
     ../configure --with-libfabric=$TRAVIS_ROOT/libfabric \
                  --disable-fortran \
                  --enable-error-checking \
-                 --enable-remote-virtual-addressing \
                  --enable-pmi-simple \
                  --prefix=$SHMEM_ROOT
+                 #--enable-remote-virtual-addressing \
     make
     make check | true
     make install

From f626ce7d51493358c0788f6f72340f8067b3484a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 9 Feb 2019 09:18:06 -0800
Subject: [PATCH 116/245] trivial fixes (#367)

* add no-warning flags for cl.hpp
* format fix
---
 FORTRAN/dgemm.f90    | 2 +-
 common/make.defs.gcc | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.f90
index 5f678c981..a68eff104 100644
--- a/FORTRAN/dgemm.f90
+++ b/FORTRAN/dgemm.f90
@@ -173,7 +173,7 @@ program main
 
   if (command_argument_count().lt.2) then
     write(*,'(a17,i1)') 'argument count = ', command_argument_count()
-    write(*,'(a62)')    'Usage: ./dgemm-pretty <# iterations> <matrix order> [<tile_size>]'
+    write(*,'(a66)')    'Usage: ./dgemm-pretty <# iterations> <matrix order> [<tile_size>]'
     stop 1
   endif
 
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 732083da1..48e7cc115 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -42,6 +42,7 @@ OPENCLFLAG=-framework OpenCL
 # Linux
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations
 #
 # SYCL flags
 #

From 631ba6f3107ecf03ab065c069cdce94463bc4f44 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 9 Feb 2019 09:18:22 -0800
Subject: [PATCH 117/245] fix CBLAS on MacOS (#368)

* add no-warning flags for cl.hpp
* fix issues with dgemm-cblas on Mac
---
 Cxx11/dgemm-cblas.cc   | 6 +++++-
 Cxx11/prk_util.h       | 6 ++++++
 common/make.defs.gcc   | 2 +-
 common/make.defs.intel | 1 +
 common/make.defs.llvm  | 2 +-
 common/make.defs.pgi   | 4 ++++
 6 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index 61a9292fb..24ae52bae 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -73,6 +73,10 @@
 #include <cblas.h>
 #endif
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
 #ifdef PRK_DEBUG
 #include <random>
 void prk_dgemm_loops(const int order,
@@ -146,7 +150,7 @@ void prk_dgemm(const int order, const int batches,
     const double beta  = 1.0;
 
     const int group_count = 1;
-    const int group_size[group_count] = { batches };
+    PRK_UNUSED const int group_size[group_count] = { batches };
 
     const CBLAS_TRANSPOSE transa_array[group_count] = { CblasNoTrans };
     const CBLAS_TRANSPOSE transb_array[group_count] = { CblasNoTrans };
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 321f91c8c..b25dccdf6 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -76,6 +76,12 @@
 
 #define RESTRICT __restrict__
 
+#if (defined(__cplusplus) && (__cplusplus >= 201703L))
+#define PRK_UNUSED [[maybe_unused]]
+#else
+#define PRK_UNUSED
+#endif
+
 namespace prk {
 
     template<class I, class T>
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 48e7cc115..e1d68dd5a 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -85,7 +85,7 @@ THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 #
 # CBLAS for C++ DGEMM
 #
-CBLASFLAG=-DACCELERATE -framework Accelerate
+CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
 #
 # CUDA flags
 #
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 00a781cac..bba53d1bb 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -81,6 +81,7 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 #
 # CBLAS for C++ DGEMM
 #
+#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
 CBLASFLAG=-DMKL -mkl
 #
 # CUDA flags
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index f4a54c4f8..a5c9010d4 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -108,7 +108,7 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 #
 # CBLAS for C++ DGEMM
 #
-CBLASFLAG=-DACCELERATE -framework Accelerate
+CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
 #
 # CUDA flags
 #
diff --git a/common/make.defs.pgi b/common/make.defs.pgi
index ddaf99a69..1205afff2 100644
--- a/common/make.defs.pgi
+++ b/common/make.defs.pgi
@@ -40,6 +40,10 @@ KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPE
 RAJADIR=./raja
 RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 #
+# CBLAS for C++ DGEMM
+#
+CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
+#
 # CUDA flags
 #
 # Linux w/ NVIDIA CUDA

From 236947e8f385f1f2a0d3c0cb1c516d227256a146 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 20 Feb 2019 22:19:49 -0800
Subject: [PATCH 118/245] check system return code in stencil-opencl (#371)

* silence GCC warning about ignored return code
---
 Cxx11/stencil-opencl.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Cxx11/stencil-opencl.cc b/Cxx11/stencil-opencl.cc
index 89a261cc9..8db6adfa4 100644
--- a/Cxx11/stencil-opencl.cc
+++ b/Cxx11/stencil-opencl.cc
@@ -83,7 +83,10 @@ void run(cl::Context context, int iterations, int n, int radius, bool star)
       std::string command("./generate-opencl-stencil.py ");
       command += ( star ? "star " : "grid " );
       command += std::to_string(radius);
-      std::system( command.c_str() );
+      int rc = std::system( command.c_str() );
+      if (rc != 0) {
+          std::cerr << command.c_str() << " returned " << rc << std::endl;
+      }
   }
   source = prk::opencl::loadProgram(filename1);
   cl::Program program1(context, source, true);

From 4195262ff1d926bba638e2791773dfb842a2d2b6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 20 Feb 2019 22:20:15 -0800
Subject: [PATCH 119/245] support FreeBSD (#370)

* support FreeBSD example
* add README for FreeBSD [ci skip]
* modify Travis run script to work locally under FreeBSD
---
 .gitignore               |   2 +
 common/README.freebsd    |  26 ++++++++
 common/make.defs.freebsd | 102 +++++++++++++++++++++++++++++
 travis/build-run-prk.sh  | 138 +++++++++++++++++++++++----------------
 4 files changed, 210 insertions(+), 58 deletions(-)
 create mode 100644 common/README.freebsd
 create mode 100644 common/make.defs.freebsd

diff --git a/.gitignore b/.gitignore
index d4a60c93f..7ed1c8b8c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,12 +117,14 @@ Cxx11/compute
 Cxx11/triSYCL
 Cxx11/occa
 Cxx11/pstl
+Cxx11/parallelstl
 Cxx11/range-v3
 Cxx11/dgemm-vector
 Cxx11/dgemm-cblas
 Cxx11/dgemm-cublas
 Cxx11/p2p-openmp-target
 Cxx11/p2p-tasks-openmp
+Cxx11/p2p-tasks-tbb
 Cxx11/p2p-vector
 Cxx11/p2p-vector-doacross-openmp
 Cxx11/p2p-vector-raja
diff --git a/common/README.freebsd b/common/README.freebsd
new file mode 100644
index 000000000..8a52f24da
--- /dev/null
+++ b/common/README.freebsd
@@ -0,0 +1,26 @@
+This is a rather terse summary of what is required to build the PRKs on FreeBSD.
+
+# Necessary Packages
+
+BSD make isn't GNU make, which the PRK assumes.
+
+sudo pkg install gmake
+
+I assume GCC works fine as it does on Linux but I tested LLVM.
+OpenMP target is not supported by LLVM 6.0.1 so those compilations will fail.
+
+sudo pkg install clang flang libpgmath
+
+## C++ dependencies
+
+sudo pkg install opencl-2.2_1 
+sudo pkg install devel/clinfo devel/ocl-icd lang/beignet lang/pocl
+sudo pkg install tbb
+sudo pkg install boost-all
+
+You will need to acquire triSYCL and Intel Parallel STL via GitHub.
+One minor issue with triSYCL was addressed by patching triSYCL.
+I suspect this issue disappears with LLVM 7.0 but you can look up
+the issue with `std::optional` on GitHub if necessary.
+
+RAJA and Kokkos were not tested.
diff --git a/common/make.defs.freebsd b/common/make.defs.freebsd
new file mode 100644
index 000000000..745813eef
--- /dev/null
+++ b/common/make.defs.freebsd
@@ -0,0 +1,102 @@
+#
+# This file shows the LLVM toolchain options for PRKs using
+# OpenMP, MPI and/or Fortran coarrays only.
+#
+# Base compilers and language options
+#
+LLVM_ROOT=/usr/local/llvm60
+LLVM_PATH=${LLVM_ROOT}/bin/
+# C99 is required in some implementations.
+CC=${LLVM_PATH}clang -std=c11 -pthread
+# All of the Fortran code is written for the 2008 standard and requires preprocessing.
+FC=/usr/local/bin/flang -Mpreprocess -Mfreeform -I/usr/local/flang/include -lexecinfo
+# C++11 may not be required but does no harm here.
+CXX=${LLVM_PATH}clang++ -std=c++14 -pthread
+#
+# Compiler flags
+#
+# -mtune=native is appropriate for most cases.
+# -march=native is appropriate if you want portable binaries.
+#
+DEFAULT_OPT_FLAGS=-g -O3
+#DEFAULT_OPT_FLAGS+=-mllvm -polly -mllvm -polly-vectorizer=stripmine
+#
+# If you want to be specific, get the architecture options from:
+#   ${LLVM_PATH}llc --version
+# and then get the CPU/ISA options from (e.g. for x86-64):
+#   ${LLVM_PATH}llc -march=x86-64 -mcpu=help
+#
+# These are useful to understand why the compiler does not vectorize loops:
+#   DEFAULT_OPT_FLAGS+=-Rpass-analysis=loop-vectorize
+#   DEFAULT_OPT_FLAGS+=-Rpass=loop-vectorize
+#
+# OpenMP flags
+#
+OPENMPFLAG=-fopenmp
+OPENMPSIMDFLAG=-fopenmp-simd
+OFFLOADFLAG=-fopenmp
+#ORNLACCFLAG= # Flang does not support OpenACC
+# Mac weirdness
+OPENMPFLAG+=-L${LLVM_ROOT}/lib
+# BSD weirdness
+OPENMPFLAG+=-I${LLVM_ROOT}/lib/clang/6.0.1/include
+#
+# OpenCL flags
+#
+OPENCLDIR=/usr/local
+OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL
+OPENCLFLAG+=-Wno-deprecated-declarations
+#
+# OCCA
+#
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
+# TBB
+#
+TBBDIR=/usr/local
+TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#
+# Parallel STL, Boost, etc.
+#
+BOOSTFLAG=-I/usr/local/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+KOKKOSDIR=/opt/kokkos/clang
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
+RAJADIR=/opt/raja/clang
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+#THRUSTDIR=/opt/nvidia/thrust
+#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
+#
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} ${DEFAULT_OPT_FLAGS} ${OPENMPFLAG}
+SYCLFLAG=-std=c++17 -I${SYCLDIR}/include ${BOOSTFLAG}
+#
+# CBLAS for C++ DGEMM
+#
+CBLASFLAG=
+#
+# CUDA flags
+#
+# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
+NVCC=/opt/llvm/cocl/bin/cocl
+# Linux w/ NVIDIA CUDA
+#NVCC=nvcc -arch=sm_50
+CUDAFLAGS=-g -O3 -std=c++11
+# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
+CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
+#
+# ISPC
+#
+ISPC=ispc
+ISPCFLAG=-O3 --target=host --opt=fast-math
+#
+# MPI
+#
+# We assume you have installed an implementation of MPI-3 that is in your path.
+MPICC=mpicc
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index de135e01f..194e7ca51 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -12,6 +12,15 @@ if [ -f ~/use-intel-compilers ] ; then
     export FC=ifort
 fi
 
+case "$os" in
+    FreeBSD)
+        MAKE=gmake
+        ;;
+    *)
+        MAKE=make
+        ;;
+esac
+
 case "$os" in
     Darwin)
         # Homebrew should put MPI here...
@@ -80,7 +89,7 @@ case "$PRK_TARGET" in
     allserial)
         echo "Serial"
         echo "CC=$CC -std=c99" >> common/make.defs
-        make $PRK_TARGET
+        ${MAKE} $PRK_TARGET
         export PRK_TARGET_PATH=SERIAL
         $PRK_TARGET_PATH/Synch_p2p/p2p       10 1024 1024
         $PRK_TARGET_PATH/Stencil/stencil     10 1000
@@ -139,7 +148,7 @@ case "$PRK_TARGET" in
         echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs
 
         # C11 without external parallelism
-        make -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop
+        ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop
         $PRK_TARGET_PATH/p2p             10 1024 1024
         $PRK_TARGET_PATH/p2p             10 1024 1024 100 100
         $PRK_TARGET_PATH/p2p-innerloop   10 1024
@@ -153,7 +162,7 @@ case "$PRK_TARGET" in
         done
 
         # C11 with POSIX or C11 thread parallelism - test POSIX here, C11 at the end.
-        make -C $PRK_TARGET_PATH transpose-thread
+        ${MAKE} -C $PRK_TARGET_PATH transpose-thread
         $PRK_TARGET_PATH/transpose-thread   10 1024 512
 
         # C11 with OpenMP
@@ -162,7 +171,7 @@ case "$PRK_TARGET" in
             g*)
                 # Host
                 echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-innerloop-openmp     10 1024
                 $PRK_TARGET_PATH/stencil-openmp           10 1000
@@ -175,7 +184,7 @@ case "$PRK_TARGET" in
                 done
                 # Offload
                 echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs
-                make -C $PRK_TARGET_PATH target
+                ${MAKE} -C $PRK_TARGET_PATH target
                 $PRK_TARGET_PATH/stencil-target     10 1000
                 $PRK_TARGET_PATH/transpose-target   10 1024 32
                 #echo "Test stencil code generator"
@@ -189,7 +198,7 @@ case "$PRK_TARGET" in
                 # Host
                 echo "Skipping Clang since OpenMP support probably missing"
                 #echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                #make -C $PRK_TARGET_PATH openmp
+                #${MAKE} -C $PRK_TARGET_PATH openmp
                 #$PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
                 #$PRK_TARGET_PATH/stencil-openmp           10 1000
                 #$PRK_TARGET_PATH/transpose-penmp          10 1024 32
@@ -203,7 +212,7 @@ case "$PRK_TARGET" in
             ic*)
                 # Host
                 echo "OPENMPFLAG=-qopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-innerloop-openmp     10 1024 1024
                 $PRK_TARGET_PATH/stencil-openmp           10 1000
@@ -217,7 +226,7 @@ case "$PRK_TARGET" in
                 # Offload - not supported on MacOS
                 if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
                     echo "OFFLOADFLAG=-qopenmp -qopenmp-offload=host" >> common/make.defs
-                    make -C $PRK_TARGET_PATH target
+                    ${MAKE} -C $PRK_TARGET_PATH target
                     $PRK_TARGET_PATH/stencil-openmp-target     10 1000
                     $PRK_TARGET_PATH/transpose-openmp-target   10 1024 32
                     #echo "Test stencil code generator"
@@ -236,7 +245,7 @@ case "$PRK_TARGET" in
         # C11 with Cilk
         if [ "${CC}" = "gcc" ] ; then
             echo "CILKFLAG=-fcilkplus" >> common/make.defs
-            make -C $PRK_TARGET_PATH stencil-cilk transpose-cilk
+            ${MAKE} -C $PRK_TARGET_PATH stencil-cilk transpose-cilk
             $PRK_TARGET_PATH/stencil-cilk     10 1000
             $PRK_TARGET_PATH/transpose-cilk   10 1024 32
             #echo "Test stencil code generator"
@@ -248,12 +257,12 @@ case "$PRK_TARGET" in
         fi
         # Use MUSL for GCC+Linux only
         if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$CC" = "gcc" ] ; then
-            make -C $PRK_TARGET_PATH clean
+            ${MAKE} -C $PRK_TARGET_PATH clean
             ./travis/install-musl.sh ${TRAVIS_ROOT} ${PRK_CC}
             echo "PRKVERSION=\"'2.16'\"" > common/make.defs
             echo "CC=${TRAVIS_ROOT}/musl/bin/musl-gcc -static -std=c11 -DUSE_C11_THREADS" >> common/make.defs
             echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs
-            make -C $PRK_TARGET_PATH transpose-thread
+            ${MAKE} -C $PRK_TARGET_PATH transpose-thread
             $PRK_TARGET_PATH/transpose-thread   10 1024 512
         fi
 
@@ -316,12 +325,12 @@ case "$PRK_TARGET" in
         echo "CXX=${PRK_CXX} -std=c++14 -pthread" >> common/make.defs
 
         # C++11 without external parallelism
-        make -C $PRK_TARGET_PATH transpose-valarray nstream-valarray
+        ${MAKE} -C $PRK_TARGET_PATH transpose-valarray nstream-valarray
         $PRK_TARGET_PATH/transpose-valarray 10 1024 32
         $PRK_TARGET_PATH/nstream-valarray   10 16777216 32
 
         # C++11 without external parallelism
-        make -C $PRK_TARGET_PATH p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector \
+        ${MAKE} -C $PRK_TARGET_PATH p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector \
                                  dgemm-vector sparse-vector
         $PRK_TARGET_PATH/p2p-vector              10 1024 1024
         $PRK_TARGET_PATH/p2p-vector              10 1024 1024 100 100
@@ -343,13 +352,13 @@ case "$PRK_TARGET" in
         # C++11 with CBLAS
         if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
             echo "CBLASFLAG=-DACCELERATE -framework Accelerate" >> common/make.defs
-            make -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas
+            ${MAKE} -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas
             $PRK_TARGET_PATH/transpose-cblas    10 1024
             $PRK_TARGET_PATH/dgemm-cblas        10 400
         fi
 
         # C++11 native parallelism
-        make -C $PRK_TARGET_PATH transpose-vector-thread transpose-vector-async
+        ${MAKE} -C $PRK_TARGET_PATH transpose-vector-thread transpose-vector-async
         $PRK_TARGET_PATH/transpose-vector-thread 10 1024 512 32
         $PRK_TARGET_PATH/transpose-vector-async  10 1024 512 32
 
@@ -359,7 +368,7 @@ case "$PRK_TARGET" in
             gcc)
                 # Host
                 echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \
+                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \
                                          transpose-openmp nstream-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-hyperplane-openmp     10 1024
@@ -375,7 +384,7 @@ case "$PRK_TARGET" in
                 done
                 # Offload
                 echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs
-                make -C $PRK_TARGET_PATH target
+                ${MAKE} -C $PRK_TARGET_PATH target
                 $PRK_TARGET_PATH/stencil-openmp-target     10 1000
                 $PRK_TARGET_PATH/transpose-openmp-target   10 1024 32
                 #echo "Test stencil code generator"
@@ -386,7 +395,7 @@ case "$PRK_TARGET" in
                 done
                 # ORNL-ACC
                 echo "ORNLACCFLAG=-fopenacc" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-hyperplane-vector-ornlacc
+                ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-vector-ornlacc
                 $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc     10 1024
                 $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc     10 1024 64
                 ;;
@@ -394,7 +403,7 @@ case "$PRK_TARGET" in
                 if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
                     # Host
                     echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                    make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \
+                    ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \
                                              transpose-openmp nstream-openmp
                     $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
                     $PRK_TARGET_PATH/p2p-hyperplane-openmp     10 1024
@@ -410,7 +419,7 @@ case "$PRK_TARGET" in
                     done
                     # Offload
                     #echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs
-                    #make -C $PRK_TARGET_PATH target
+                    #${MAKE} -C $PRK_TARGET_PATH target
                     #$PRK_TARGET_PATH/stencil-openmp-target     10 1000
                     #$PRK_TARGET_PATH/transpose-openmp-target   10 1024 32
                     ##echo "Test stencil code generator"
@@ -426,7 +435,7 @@ case "$PRK_TARGET" in
             icc)
                 # Host
                 echo "OPENMPFLAG=-qopenmp" >> common/make.defs
-                make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp \
+                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp \
                                          transpose-openmp nstream-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp                 10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-innerloop-openmp             10 1024 1024
@@ -442,7 +451,7 @@ case "$PRK_TARGET" in
                 # Offload - not supported on MacOS
                 if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
                     echo "OFFLOADFLAG=-qopenmp -qopenmp-offload=host" >> common/make.defs
-                    make -C $PRK_TARGET_PATH target
+                    ${MAKE} -C $PRK_TARGET_PATH target
                     $PRK_TARGET_PATH/stencil-openmp-target     10 1000
                     $PRK_TARGET_PATH/transpose-openmp-target   10 1024 32
                     #echo "Test stencil code generator"
@@ -458,14 +467,20 @@ case "$PRK_TARGET" in
                 ;;
         esac
 
-        # Boost.Compute found after OpenCL, and only available in Travis with MacOS.
-        echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs
-
-        #echo "RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}" >> common/make.defs
-        echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs
+        # Boost.Compute runs after OpenCL, and only available in Travis with MacOS.
+        case "$os" in
+            FreeBSD)
+                echo "BOOSTFLAG=-DUSE_BOOST -I/usr/local/include" >> common/make.defs
+                echo "RANGEFLAG=-DUSE_BOOST_IRANGE -I/usr/local/include" >> common/make.defs
+                ;;
+            *)
+                echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs
+                echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs
+                ;;
+        esac
 
         # C++11 with rangefor and Boost.Ranges
-        make -C $PRK_TARGET_PATH rangefor
+        ${MAKE} -C $PRK_TARGET_PATH rangefor
         $PRK_TARGET_PATH/stencil-vector-rangefor     10 1000
         $PRK_TARGET_PATH/transpose-vector-rangefor   10 1024 32
         $PRK_TARGET_PATH/nstream-vector-rangefor     10 16777216 32
@@ -493,7 +508,7 @@ case "$PRK_TARGET" in
                     export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH}
                     ;;
             esac
-            make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb p2p-tasks-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
+            ${MAKE} -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb p2p-tasks-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb
             $PRK_TARGET_PATH/p2p-innerloop-vector-tbb     10 1024
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 1
             $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb    10 1024 32
@@ -510,7 +525,7 @@ case "$PRK_TARGET" in
         fi
 
         # C++11 with STL
-        make -C $PRK_TARGET_PATH p2p-hyperplane-vector-stl stencil-vector-stl transpose-vector-stl nstream-vector-stl
+        ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-vector-stl stencil-vector-stl transpose-vector-stl nstream-vector-stl
         $PRK_TARGET_PATH/p2p-hyperplane-vector-stl    10 1024 1
         $PRK_TARGET_PATH/p2p-hyperplane-vector-stl    10 1024 32
         $PRK_TARGET_PATH/stencil-vector-stl           10 1000
@@ -532,7 +547,7 @@ case "$PRK_TARGET" in
             else
                 echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs
             fi
-            make -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl
+            ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl
             $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl    10 1024 1
             $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl    10 1024 32
             $PRK_TARGET_PATH/stencil-vector-pstl           10 1000
@@ -549,7 +564,7 @@ case "$PRK_TARGET" in
         # C++11 with OpenCL
         if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
             echo "OPENCLFLAG=-framework OpenCL" >> common/make.defs
-            make -C $PRK_TARGET_PATH opencl
+            ${MAKE} -C $PRK_TARGET_PATH opencl
             # must run programs in same directory as OpenCL source files...
             cd $PRK_TARGET_PATH
             ./stencil-opencl     10 1000
@@ -572,7 +587,7 @@ case "$PRK_TARGET" in
         # (2) Boost.Compute is not available from APT.
         # If we ever address 1, we need to enable the Boost.Compute install for Linux.
         if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
-            make -C $PRK_TARGET_PATH nstream-vector-boost-compute
+            ${MAKE} -C $PRK_TARGET_PATH nstream-vector-boost-compute
             $PRK_TARGET_PATH/nstream-vector-boost-compute     10 16777216 32
         fi
 
@@ -592,7 +607,7 @@ case "$PRK_TARGET" in
         esac
         # RAJA
         if [ 0 = 1 ] ; then
-        make -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \
+        ${MAKE} -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \
                                  p2p-raja stencil-raja transpose-raja nstream-raja
         # New (Views)
         $PRK_TARGET_PATH/p2p-raja                10 1024 1024
@@ -623,7 +638,7 @@ case "$PRK_TARGET" in
         done
         fi
         # Kokkos
-        make -C $PRK_TARGET_PATH stencil-kokkos transpose-kokkos nstream-kokkos
+        ${MAKE} -C $PRK_TARGET_PATH stencil-kokkos transpose-kokkos nstream-kokkos
         $PRK_TARGET_PATH/stencil-kokkos     10 1000
         $PRK_TARGET_PATH/transpose-kokkos   10 1024 32
         $PRK_TARGET_PATH/nstream-kokkos     10 16777216 32
@@ -638,7 +653,7 @@ case "$PRK_TARGET" in
         #if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
         #    echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs
         #    export OCCA_CXX=${PRK_CXX}
-        #    make -C $PRK_TARGET_PATH transpose-occa nstream-occa
+        #    ${MAKE} -C $PRK_TARGET_PATH transpose-occa nstream-occa
         #    $PRK_TARGET_PATH/transpose-occa   10 1024 32
         #    $PRK_TARGET_PATH/nstream-occa     10 16777216 32
         #fi
@@ -654,7 +669,7 @@ case "$PRK_TARGET" in
                 echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++17" >> common/make.defs
             fi
             echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs
-            make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
+            ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
             #$PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o
             $PRK_TARGET_PATH/stencil-sycl        10 1000
             $PRK_TARGET_PATH/transpose-sycl      10 1024 32
@@ -698,9 +713,16 @@ case "$PRK_TARGET" in
                 echo "COARRAYFLAG=-fcoarray=single" >> common/make.defs
                 ;;
             clang)
-                echo "LLVM Fortran is not supported."
-                exit 9
-                echo "FC=flang" >> common/make.defs
+                case "$os" in
+                    FreeBSD)
+                        echo "FC=flang -Mpreprocess -Mfreeform -I/usr/local/flang/include -lexecinfo" >> common/make.defs
+                        ;;
+                    *)
+                        # untested
+                        echo "FC=flang -Mpreprocess -Mfreeform" >> common/make.defs
+                        ;;
+                esac
+                echo "OPENMPFLAG=-fopenmp" >> common/make.defs
                 ;;
             icc)
                 # -heap-arrays prevents SEGV in transpose-pretty (?)
@@ -713,7 +735,7 @@ case "$PRK_TARGET" in
         esac
 
         # Serial
-        make -C ${PRK_TARGET_PATH} p2p p2p-innerloop stencil transpose nstream dgemm
+        ${MAKE} -C ${PRK_TARGET_PATH} p2p p2p-innerloop stencil transpose nstream dgemm
         $PRK_TARGET_PATH/p2p               10 1024 1024
         $PRK_TARGET_PATH/p2p-innerloop     10 1024
         $PRK_TARGET_PATH/stencil           10 1000
@@ -724,7 +746,7 @@ case "$PRK_TARGET" in
         $PRK_TARGET_PATH/dgemm             10 400 32
 
         # Pretty
-        make -C ${PRK_TARGET_PATH} stencil-pretty transpose-pretty nstream-pretty dgemm-pretty
+        ${MAKE} -C ${PRK_TARGET_PATH} stencil-pretty transpose-pretty nstream-pretty dgemm-pretty
         #$PRK_TARGET_PATH/p2p-pretty          10 1024 1024
         # pretty versions do not support tiling...
         $PRK_TARGET_PATH/stencil-pretty      10 1000
@@ -733,7 +755,7 @@ case "$PRK_TARGET" in
         $PRK_TARGET_PATH/dgemm-pretty        10 400
 
         # OpenMP host
-        make -C ${PRK_TARGET_PATH} p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp \
+        ${MAKE} -C ${PRK_TARGET_PATH} p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp \
                                    nstream-openmp dgemm-openmp
         export OMP_NUM_THREADS=2
         $PRK_TARGET_PATH/p2p-tasks-openmp     10 1024 1024
@@ -749,7 +771,7 @@ case "$PRK_TARGET" in
         # Intel Mac does not support OpenMP target or coarrays
         if [ "${CC}" = "gcc" ] || [ "${TRAVIS_OS_NAME}" = "linux" ] ; then
             # OpenMP target
-            make -C ${PRK_TARGET_PATH} stencil-openmp-target transpose-openmp-target nstream-openmp-target
+            ${MAKE} -C ${PRK_TARGET_PATH} stencil-openmp-target transpose-openmp-target nstream-openmp-target
             export OMP_NUM_THREADS=2
             #$PRK_TARGET_PATH/p2p-openmp-target           10 1024 1024 # most compilers do not support doacross yet
             $PRK_TARGET_PATH/stencil-openmp-target       10 1000
@@ -758,7 +780,7 @@ case "$PRK_TARGET" in
             $PRK_TARGET_PATH/nstream-openmp-target       10 16777216
 
             # Fortran coarrays
-            make -C ${PRK_TARGET_PATH} coarray
+            ${MAKE} -C ${PRK_TARGET_PATH} coarray
             export PRK_MPI_PROCS=4
             if [ "${CC}" = "gcc" ] ; then
                 if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then
@@ -803,7 +825,7 @@ case "$PRK_TARGET" in
             echo "CC=$CC -std=c99" >> common/make.defs
             echo "OPENMPFLAG=-fopenmp" >> common/make.defs
         fi
-        make $PRK_TARGET
+        ${MAKE} $PRK_TARGET
         export PRK_TARGET_PATH=OPENMP
         export OMP_NUM_THREADS=4
         $PRK_TARGET_PATH/Synch_p2p/p2p            $OMP_NUM_THREADS 10 1024 1024
@@ -870,7 +892,7 @@ case "$PRK_TARGET" in
         echo "OPENMPFLAG=-fopenmp" >> common/make.defs
 
         echo "MPI-1"
-        make allmpi1
+        ${MAKE} allmpi1
         export PRK_TARGET_PATH=MPI1
         export PRK_MPI_PROCS=4
         export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}"
@@ -894,7 +916,7 @@ case "$PRK_TARGET" in
         # MPI+OpenMP is just too much of a pain with Clang right now.
         if [ "${CC}" = "gcc" ] ; then
             echo "MPI+OpenMP"
-            make allmpiomp
+            ${MAKE} allmpiomp
             export PRK_TARGET_PATH=MPIOPENMP
             export PRK_MPI_PROCS=2
             export OMP_NUM_THREADS=2
@@ -906,7 +928,7 @@ case "$PRK_TARGET" in
         fi
 
         echo "MPI-RMA"
-        make allmpirma
+        ${MAKE} allmpirma
         export PRK_TARGET_PATH=MPIRMA
         export PRK_MPI_PROCS=4
         export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}"
@@ -915,7 +937,7 @@ case "$PRK_TARGET" in
         $PRK_RUN $PRK_TARGET_PATH/Transpose/transpose 10 1024 32
 
         echo "MPI+MPI"
-        make allmpishm
+        ${MAKE} allmpishm
         export PRK_TARGET_PATH=MPISHM
         export PRK_MPI_PROCS=4
         export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}"
@@ -930,7 +952,7 @@ case "$PRK_TARGET" in
         export LD_LIBRARY_PATH=${TRAVIS_ROOT}/sandia-openshmem/lib:${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH
         export SHMEM_ROOT=${TRAVIS_ROOT}/sandia-openshmem
         echo "SHMEMTOP=$SHMEM_ROOT\nSHMEMCC=$SHMEM_ROOT/bin/oshcc" >> common/make.defs
-        make $PRK_TARGET
+        ${MAKE} $PRK_TARGET
         export PRK_TARGET_PATH=SHMEM
         export PRK_SHMEM_PROCS=4
         export OSHRUN_LAUNCHER=${TRAVIS_ROOT}/hydra/bin/mpirun
@@ -960,7 +982,7 @@ case "$PRK_TARGET" in
                 echo "UPCC=$UPC_ROOT/bin/upc" >> common/make.defs
                 export PRK_LAUNCHER=""
                 export PRK_LAUNCHER_ARGS="-n $PRK_UPC_PROCS"
-                make $PRK_TARGET
+                ${MAKE} $PRK_TARGET
                 ;;
             bupc)
                 export UPC_ROOT=${TRAVIS_ROOT}/bupc-$CC
@@ -987,7 +1009,7 @@ case "$PRK_TARGET" in
                         export PRK_LAUNCHER="$UPC_ROOT/bin/upcrun -N 1 -n $PRK_UPC_PROCS -c $PRK_UPC_PROCS"
                         ;;
                 esac
-                make $PRK_TARGET PRK_FLAGS="-Wc,-O3"
+                ${MAKE} $PRK_TARGET PRK_FLAGS="-Wc,-O3"
                 ;;
             *)
                 echo "Invalid value of UPC_IMPL ($UPC_IMPL)"
@@ -1013,7 +1035,7 @@ case "$PRK_TARGET" in
                 ;;
         esac
         echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs
-        make $PRK_TARGET PRK_FLAGS=-O3
+        ${MAKE} $PRK_TARGET PRK_FLAGS=-O3
         export PRK_TARGET_PATH=CHARM++
         export PRK_CHARM_PROCS=4
         export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun
@@ -1041,7 +1063,7 @@ case "$PRK_TARGET" in
                 ;;
         esac
         echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs
-        make $PRK_TARGET PRK_FLAGS="-O3 -std=gnu99"
+        ${MAKE} $PRK_TARGET PRK_FLAGS="-O3 -std=gnu99"
         export PRK_TARGET_PATH=AMPI
         export PRK_CHARM_PROCS=4
         export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun
@@ -1072,7 +1094,7 @@ case "$PRK_TARGET" in
         echo "Fine-Grain MPI (FG-MPI)"
         export FGMPI_ROOT=${TRAVIS_ROOT}/fgmpi
         echo "FGMPITOP=$FGMPI_ROOT\nFGMPICC=$FGMPI_ROOT/bin/mpicc -std=c99" >> common/make.defs
-        make $PRK_TARGET
+        ${MAKE} $PRK_TARGET
         export PRK_TARGET_PATH=FG_MPI
         export PRK_MPI_PROCS=2
         export PRK_FGMPI_THREADS=2
@@ -1099,7 +1121,7 @@ case "$PRK_TARGET" in
         export SCRIPT_PATH=${TRAVIS_ROOT}/grappa/bin
         ########################
         echo "GRAPPATOP=${TRAVIS_ROOT}/grappa" >> common/make.defs
-        make $PRK_TARGET
+        ${MAKE} $PRK_TARGET
         export PRK_TARGET_PATH=GRAPPA
         export PRK_MPI_PROCS=2
         export PRK_LAUNCHER=$MPI_ROOT/bin/mpirun
@@ -1122,6 +1144,6 @@ case "$PRK_TARGET" in
     alllegion)
         echo "Legion"
         echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs
-        make $PRK_TARGET -k
+        ${MAKE} $PRK_TARGET -k
         ;;
 esac

From 283eca102910ce85794f843c38db21ee14487d00 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Wed, 20 Feb 2019 22:27:01 -0800
Subject: [PATCH 120/245] add PRK nstream for C11 (#369)

* add PRK nstream for C11
* add MPI version
* add MEMKIND nstream
* add MEMKIND to example make.defs
* add MMAP nstream
---
 .gitignore             |  26 ++++-
 C1z/Makefile           |  37 +++++-
 C1z/nstream-memkind.c  | 228 +++++++++++++++++++++++++++++++++++++
 C1z/nstream-mmap.c     | 248 +++++++++++++++++++++++++++++++++++++++++
 C1z/nstream-mpi.c      | 216 +++++++++++++++++++++++++++++++++++
 C1z/nstream.c          | 183 ++++++++++++++++++++++++++++++
 C1z/prk_util.h         |  62 +++++++++--
 common/make.defs.gcc   |   2 +
 common/make.defs.intel |   2 +
 common/make.defs.llvm  |   3 +
 10 files changed, 991 insertions(+), 16 deletions(-)
 create mode 100644 C1z/nstream-memkind.c
 create mode 100644 C1z/nstream-mmap.c
 create mode 100644 C1z/nstream-mpi.c
 create mode 100644 C1z/nstream.c

diff --git a/.gitignore b/.gitignore
index 7ed1c8b8c..66948e148 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,8 @@ octave-workspace                # Octave crashes
 */*/*.optrpt
 *__genmod.*                     # Intel Fortran compiler
 */*__genmod.mod
+*.patch
+*/*.patch
 
 common/make.defs
 scripts/small/runfgmpi
@@ -95,6 +97,13 @@ SERIAL/Sparse/sparse
 SERIAL/Stencil/stencil
 SERIAL/Synch_p2p/p2p
 SERIAL/Transpose/transpose
+C1z/nstream
+C1z/nstream-openmp
+C1z/nstream-mpi
+C1z/nstream-memkind
+C1z/nstream-memkind-openmp
+C1z/nstream-mmap
+C1z/nstream-mmap-openmp
 C1z/p2p
 C1z/p2p-innerloop
 C1z/p2p-innerloop-openmp
@@ -131,9 +140,11 @@ Cxx11/p2p-vector-raja
 Cxx11/p2p-vector-tbb
 Cxx11/p2p-innerloop-openmp
 Cxx11/p2p-doacross-vector-openmp
+Cxx11/p2p-doacross-openmp
 Cxx11/p2p-innerloop-opencl
 Cxx11/p2p-innerloop-vector
 Cxx11/p2p-hyperplane-vector
+Cxx11/p2p-hyperplane-openmp
 Cxx11/p2p-hyperplane-vector-openmp
 Cxx11/p2p-innerloop-vector-tbb
 Cxx11/p2p-hyperplane-vector-stl
@@ -167,6 +178,7 @@ Cxx11/stencil-vector-cilk
 Cxx11/stencil-vector-stl
 Cxx11/stencil-vector-pstl
 Cxx11/stencil-vector-raja
+Cxx11/stencil-openmp
 Cxx11/stencil-raja
 Cxx11/stencil-vector-rangefor
 Cxx11/stencil-vector-tbb
@@ -174,6 +186,7 @@ Cxx11/stencil-vector-taskloop
 Cxx11/stencil-kokkos
 Cxx11/stencil-cuda
 Cxx11/stencil-sycl
+Cxx11/transpose-openmp
 Cxx11/transpose-opencl
 Cxx11/transpose-sycl
 Cxx11/transpose-openmp-target
@@ -215,6 +228,14 @@ Cxx11/star6.cl
 Cxx11/star7.cl
 Cxx11/star8.cl
 Cxx11/star9.cl
+Cxx11/star10.cl
+Cxx11/hipSYCL
+Cxx11/cpp-proposals-pub
+Cxx11/ornl-mdspan
+Cxx11/boost.tgz
+Cxx11/boost.tbz
+Cxx11/OpenCL-CLHPP
+Cxx11/GSL
 FORTRAN/dgemm-taskloop-openmp
 FORTRAN/dgemm-pretty
 FORTRAN/dgemm-openmp
@@ -256,8 +277,3 @@ FORTRAN/transpose-ornlacc
 RUST/p2p/Cargo.lock
 RUST/stencil/Cargo.lock
 RUST/transpose/Cargo.lock
-nstream-openmp
-p2p-doacross-openmp
-p2p-hyperplane-openmp
-stencil-openmp
-transpose-openmp
diff --git a/C1z/Makefile b/C1z/Makefile
index 0df8225c1..aac123acc 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -10,6 +10,10 @@ ifdef VERBOSE
   CFLAGS += -DVERBOSE
 endif
 
+ifdef PRK_USE_MMAP
+  CFLAGS += -DPRK_USE_MMAP
+endif
+
 ifeq ($(findstring musl,$(CC)),musl)
   CFLAGS += -DUSE_C11_THREADS
 endif
@@ -41,11 +45,17 @@ endif
 
 all: serial thread openmp taskloop $(EXTRA)
 
-serial: p2p p2p-innerloop stencil transpose
+serial: nstream p2p p2p-innerloop stencil transpose
 
 thread: transpose-thread
 
-openmp: p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+
+mpi: nstream-mpi
+
+memkind: nstream-memkind nstream-memkind-openmp
+
+mmap: nstream-mmap nstream-mmap-openmp
 
 target: stencil-target transpose-target
 
@@ -58,12 +68,30 @@ ispc: transpose-ispc
 p2p-innerloop: p2p-innerloop-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
+%-mpi: %-mpi.c prk_util.h
+	$(MPICC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
+
+%-memkind: %-memkind.c prk_util.h
+	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) $(MEMKINDFLAGS) -o $@
+
+%-memkind-openmp: %-memkind.c prk_util.h
+	$(CC) $(CFLAGS) $(OMPFLAGS) $< $(EXTRA_CLIBS) $(MEMKINDFLAGS) -o $@
+
+%-mmap: %-mmap.c prk_util.h
+	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
+
+%-mmap-openmp: %-mmap.c prk_util.h
+	$(CC) $(CFLAGS) $(OMPFLAGS) $< $(EXTRA_CLIBS) -o $@
+
 %-target: %-target.c prk_util.h
 	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) $(EXTRA_CLIBS) -o $@
 
 %-taskloop: %-taskloop.c prk_util.h
 	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
 
+nstream-openmp: nstream.c prk_util.h
+	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
+
 %-openmp: %-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
 
@@ -92,13 +120,16 @@ clean:
 	-rm -f *.optrpt
 	-rm -f *.dwarf
 	-rm -rf *.dSYM # Mac
-	-rm -f p2p p2p-innerloop stencil transpose
+	-rm -f nstream p2p p2p-innerloop stencil transpose
 	-rm -f *-openmp
+	-rm -f *-mpi
 	-rm -f *-target
 	-rm -f *-taskloop
 	-rm -f *-cilk
 	-rm -f *-thread
 	-rm -f *-ispc
+	-rm -f nstream-mmap nstream-memkind
+	-rm -f nstream-mmap-openmp nstream-memkind-openmp
 
 cleancl:
 	-rm -f star[123456789].cl
diff --git a/C1z/nstream-memkind.c b/C1z/nstream-memkind.c
new file mode 100644
index 000000000..465f7a067
--- /dev/null
+++ b/C1z/nstream-memkind.c
@@ -0,0 +1,228 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///          Converted to C11 by Jeff Hammond, February 2019.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+#include <memkind.h>
+#ifndef MEMKIND_PMEM_MIN_SIZE
+# define MEMKIND_PMEM_MIN_SIZE (1024 * 1024 * 16)
+#endif
+
+int main(int argc, char * argv[])
+{
+  printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
+#ifdef _OPENMP
+  printf("C11/OpenMP STREAM triad: A = B + scalar * C\n");
+#else
+  printf("C11 STREAM triad: A = B + scalar * C\n");
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    printf("Usage: <# iterations> <vector length>\n");
+    return 1;
+  }
+
+  // number of times to do the transpose
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    printf("ERROR: iterations must be >= 1\n");
+    return 1;
+  }
+
+  // length of a the matrix
+  size_t length = atol(argv[2]);
+  if (length <= 0) {
+    printf("ERROR: Matrix length must be greater than 0\n");
+    return 1;
+  }
+
+#ifdef _OPENMP
+  printf("Number of threads    = %d\n", omp_get_max_threads());
+#endif
+  printf("Number of iterations = %d\n", iterations);
+  printf("Vector length        = %zu\n", length);
+  //printf("Offset               = %d\n", offset);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time = 0.0;
+
+  size_t bytes = length*sizeof(double);
+
+  char * pool_path = getenv("PRK_MEMKIND_POOL_PATH");
+  if (pool_path == NULL) {
+      pool_path = "/pmem";
+  }
+  printf("MEMKIND pool path = %s\n", pool_path);
+  struct memkind * memkind_handle;
+  int err = memkind_create_pmem(pool_path, 0, &memkind_handle);
+  if (err) {
+    printf("MEMKIND failed to create a memory pool! (err=%d, errno=%d)\n", err, errno);
+  }
+
+  size_t usable_size = 0;
+
+  double * restrict A = memkind_malloc(memkind_handle, bytes);
+  if (A==NULL) {
+    printf("MEMKIND failed to allocate A! (errno=%d)\n", errno);
+  }
+  usable_size = memkind_malloc_usable_size(memkind_handle, A);
+  printf("A usage size = %zu\n", usable_size);
+
+  double * restrict B = memkind_malloc(memkind_handle, bytes);
+  if (B==NULL) {
+    printf("MEMKIND failed to allocate B! (errno=%d)\n", errno);
+  }
+  usable_size = memkind_malloc_usable_size(memkind_handle, B);
+  printf("B usage size = %zu\n", usable_size);
+
+  double * restrict C = memkind_malloc(memkind_handle, bytes);
+  if (C==NULL) {
+    printf("MEMKIND failed to allocate C! (errno=%d)\n", errno);
+  }
+  usable_size = memkind_malloc_usable_size(memkind_handle, C);
+  printf("C usage size = %zu\n", usable_size);
+
+  double scalar = 3.0;
+
+  OMP_PARALLEL()
+  {
+    OMP_FOR_SIMD()
+    for (size_t i=0; i<length; i++) {
+      A[i] = 0.0;
+      B[i] = 2.0;
+      C[i] = 2.0;
+    }
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) {
+          OMP_BARRIER
+          OMP_MASTER
+          nstream_time = prk_wtime();
+      }
+
+      OMP_FOR_SIMD()
+      for (size_t i=0; i<length; i++) {
+          A[i] += B[i] + scalar * C[i];
+      }
+    }
+    OMP_BARRIER
+    OMP_MASTER
+    nstream_time = prk_wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar = 0.0;
+  double br = 2.0;
+  double cr = 2.0;
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum = 0.0;
+  OMP_PARALLEL_FOR_REDUCE( +:asum )
+  for (size_t i=0; i<length; i++) {
+      asum += fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (fabs(ar-asum)/asum > epsilon) {
+      printf("Failed Validation on output array\n"
+             "       Expected checksum: %lf\n"
+             "       Observed checksum: %lf\n"
+             "ERROR: solution did not validate\n", ar, asum);
+      return 1;
+  } else {
+      printf("Solution validates\n");
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
+  }
+
+  memkind_free(memkind_handle, A);
+  memkind_free(memkind_handle, B);
+  memkind_free(memkind_handle, C);
+
+  err = memkind_destroy_kind(memkind_handle);
+  if (err) {
+      printf("MEMKIND failed to create destroy a memory pool! (err=%d, errno=%d)\n", err, errno);
+  }
+
+  return 0;
+}
+
+
diff --git a/C1z/nstream-mmap.c b/C1z/nstream-mmap.c
new file mode 100644
index 000000000..fb0942c47
--- /dev/null
+++ b/C1z/nstream-mmap.c
@@ -0,0 +1,248 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///          Converted to C11 by Jeff Hammond, February 2019.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <asm-generic/mman.h>
+
+int main(int argc, char * argv[])
+{
+  printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
+#ifdef _OPENMP
+  printf("C11/OpenMP STREAM triad: A = B + scalar * C\n");
+#else
+  printf("C11 STREAM triad: A = B + scalar * C\n");
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    printf("Usage: <# iterations> <vector length>\n");
+    return 1;
+  }
+
+  // number of times to do the transpose
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    printf("ERROR: iterations must be >= 1\n");
+    return 1;
+  }
+
+  // length of a the matrix
+  size_t length = atol(argv[2]);
+  if (length <= 0) {
+    printf("ERROR: Matrix length must be greater than 0\n");
+    return 1;
+  }
+
+#ifdef _OPENMP
+  printf("Number of threads    = %d\n", omp_get_max_threads());
+#endif
+  printf("Number of iterations = %d\n", iterations);
+  printf("Vector length        = %zu\n", length);
+  //printf("Offset               = %d\n", offset);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time = 0.0;
+
+  size_t bytes = length*sizeof(double);
+
+  char mmap_path[255] = {0};
+  char * mmap_env = getenv("PRK_MMAP_PATH");
+  fprintf(stderr, "PRK_MMAP_PATH=%s\n", mmap_env);
+  if (mmap_env==NULL) {
+      strcpy(mmap_path, "/tmp/prk_mmap");
+  } else {
+      strcpy(mmap_path, mmap_env);
+  }
+
+  fprintf(stderr, "mmap_path=%s\n", mmap_path);
+  int fd = open(mmap_path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  if (fd == -1) {
+      fprintf(stderr, "open returned %d\n", fd);
+      char error_name[255] = {0};
+      prk_lookup_posix_error(errno, error_name, 255);
+      printf("error name: %s\n", error_name);
+      abort();
+  }
+
+  int rc = ftruncate(fd, 3*bytes);
+  if (rc == -1) {
+      fprintf(stderr, "ftruncate returned %d\n", rc);
+      char error_name[255] = {0};
+      prk_lookup_posix_error(errno, error_name, 255);
+      printf("error name: %s\n", error_name);
+      abort();
+  }
+
+  int flags = 0;
+  //flags |= MAP_PRIVATE;
+  flags |= MAP_SHARED;
+  //flags |= MAP_NORESERVE;
+  flags |= MAP_POPULATE;
+  //flags |= MAP_UNINITIALIZED;
+  //flags |= MAP_HUGETLB;
+  //flags |= MAP_HUGE_2MB;
+  //flags |= MAP_SYNC;
+
+  double * ptr = (double*)mmap(NULL, 3*bytes, PROT_READ | PROT_WRITE, flags, fd, 0);
+  //double * ptr = (double*)mmap(NULL, 3*bytes, PROT_READ | PROT_WRITE, flags | MAP_ANONYMOUS, -1, 0);
+  if (ptr==MAP_FAILED || ptr==NULL) {
+      fprintf(stderr, "mmap returned %p, errno=%d\n", ptr, errno);
+      char error_name[255] = {0};
+      prk_lookup_posix_error(errno, error_name, 255);
+      printf("error name: %s\n", error_name);
+      abort();
+  }
+
+  double * restrict A = &ptr[0];
+  double * restrict B = &ptr[length];
+  double * restrict C = &ptr[length*2];
+
+  double scalar = 3.0;
+
+  OMP_PARALLEL()
+  {
+    OMP_FOR_SIMD()
+    for (size_t i=0; i<length; i++) {
+      A[i] = 0.0;
+      B[i] = 2.0;
+      C[i] = 2.0;
+    }
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) {
+          OMP_BARRIER
+          OMP_MASTER
+          nstream_time = prk_wtime();
+      }
+
+      OMP_FOR_SIMD()
+      for (size_t i=0; i<length; i++) {
+          A[i] += B[i] + scalar * C[i];
+      }
+    }
+    OMP_BARRIER
+    OMP_MASTER
+    nstream_time = prk_wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar = 0.0;
+  double br = 2.0;
+  double cr = 2.0;
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum = 0.0;
+  OMP_PARALLEL_FOR_REDUCE( +:asum )
+  for (size_t i=0; i<length; i++) {
+      asum += fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (fabs(ar-asum)/asum > epsilon) {
+      printf("Failed Validation on output array\n"
+             "       Expected checksum: %lf\n"
+             "       Observed checksum: %lf\n"
+             "ERROR: solution did not validate\n", ar, asum);
+      return 1;
+  } else {
+      printf("Solution validates\n");
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
+  }
+
+  int err = munmap(ptr, 3*bytes);
+  if (err) {
+      printf("munmap failed! (err=%d, errno=%d)\n", err, errno);
+  }
+  err = close(fd);
+  if (err) {
+      printf("close failed! (err=%d, errno=%d)\n", err, errno);
+  }
+
+  return 0;
+}
+
+
diff --git a/C1z/nstream-mpi.c b/C1z/nstream-mpi.c
new file mode 100644
index 000000000..438842859
--- /dev/null
+++ b/C1z/nstream-mpi.c
@@ -0,0 +1,216 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///          Converted to C11 by Jeff Hammond, February 2019.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+#include <mpi.h>
+
+int main(int argc, char * argv[])
+{
+  int me, np;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  if (me==0) {
+      printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
+      printf("C11/MPI STREAM triad: A = B + scalar * C\n");
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    if (me==0) printf("Usage: <# iterations> <vector length>\n");
+    MPI_Finalize();
+    return 1;
+  }
+
+  // number of times to do the transpose
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    if (me==0) printf("ERROR: iterations must be >= 1\n");
+    MPI_Finalize();
+    return 1;
+  }
+
+  // length of a the matrix
+  size_t length = atol(argv[2]);
+  if (length <= 0) {
+    if (me==0) printf("ERROR: Matrix length must be greater than 0\n");
+    MPI_Finalize();
+    return 1;
+  }
+
+  if (me==0) {
+      printf("Number of processes  = %d\n", np);
+      printf("Number of iterations = %d\n", iterations);
+      printf("Vector length        = %zu\n", length);
+      //printf("Offset               = %d\n", offset);
+  }
+
+  size_t local_length;
+  if (length % np == 0) {
+      local_length = length / np;
+  } else {
+      double x = (double)length / np;
+      size_t y = (size_t)ceil(x);
+      if (me != (np-1)) {
+          local_length = y;
+      } else {
+          local_length = length - y*(np-1);
+      }
+  }
+  //printf("Vector length (%4d) = %zu\n", me, local_length);
+  fflush(stdout);
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time = 0.0;
+
+  double * restrict A;
+  double * restrict B;
+  double * restrict C;
+
+  MPI_Win wA, wB, wC;
+
+  size_t bytes = local_length*sizeof(double);
+
+  MPI_Win_allocate_shared(bytes, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, (void**)&A, &wA);
+  MPI_Win_allocate_shared(bytes, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, (void**)&B, &wB);
+  MPI_Win_allocate_shared(bytes, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, (void**)&C, &wC);
+
+  double scalar = 3.0;
+
+  for (size_t i=0; i<local_length; i++) {
+    A[i] = 0.0;
+    B[i] = 2.0;
+    C[i] = 2.0;
+  }
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  for (int iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) {
+        MPI_Barrier(MPI_COMM_WORLD);
+        nstream_time = MPI_Wtime();
+    }
+
+    for (size_t i=0; i<local_length; i++) {
+        A[i] += B[i] + scalar * C[i];
+    }
+  }
+  MPI_Barrier(MPI_COMM_WORLD);
+  nstream_time = MPI_Wtime() - nstream_time;
+
+  MPI_Allreduce(MPI_IN_PLACE, &nstream_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar = 0.0;
+  double br = 2.0;
+  double cr = 2.0;
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= local_length;
+
+  double asum = 0.0;
+  for (size_t i=0; i<local_length; i++) {
+      asum += fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (fabs(ar-asum)/asum > epsilon) {
+      printf("Failed Validation on output array\n"
+             "       Expected checksum: %lf\n"
+             "       Observed checksum: %lf\n"
+             "ERROR: solution did not validate\n", ar, asum);
+      return 1;
+  } else {
+      if (me==0) printf("Solution validates\n");
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      if (me==0) printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
+  }
+
+  MPI_Win_free(&wA);
+  MPI_Win_free(&wB);
+  MPI_Win_free(&wC);
+
+  MPI_Finalize();
+
+  return 0;
+}
+
+
diff --git a/C1z/nstream.c b/C1z/nstream.c
new file mode 100644
index 000000000..7661662dc
--- /dev/null
+++ b/C1z/nstream.c
@@ -0,0 +1,183 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///          Converted to C11 by Jeff Hammond, February 2019.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
+#ifdef _OPENMP
+  printf("C11/OpenMP STREAM triad: A = B + scalar * C\n");
+#else
+  printf("C11 STREAM triad: A = B + scalar * C\n");
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    printf("Usage: <# iterations> <vector length>\n");
+    return 1;
+  }
+
+  // number of times to do the transpose
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    printf("ERROR: iterations must be >= 1\n");
+    return 1;
+  }
+
+  // length of a the matrix
+  size_t length = atol(argv[2]);
+  if (length <= 0) {
+    printf("ERROR: Matrix length must be greater than 0\n");
+    return 1;
+  }
+
+#ifdef _OPENMP
+  printf("Number of threads    = %d\n", omp_get_max_threads());
+#endif
+  printf("Number of iterations = %d\n", iterations);
+  printf("Vector length        = %zu\n", length);
+  //printf("Offset               = %d\n", offset);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time = 0.0;
+
+  size_t bytes = length*sizeof(double);
+  double * restrict A = prk_malloc(bytes);
+  double * restrict B = prk_malloc(bytes);
+  double * restrict C = prk_malloc(bytes);
+
+  double scalar = 3.0;
+
+  OMP_PARALLEL()
+  {
+    OMP_FOR_SIMD()
+    for (size_t i=0; i<length; i++) {
+      A[i] = 0.0;
+      B[i] = 2.0;
+      C[i] = 2.0;
+    }
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) {
+          OMP_BARRIER
+          OMP_MASTER
+          nstream_time = prk_wtime();
+      }
+
+      OMP_FOR_SIMD()
+      for (size_t i=0; i<length; i++) {
+          A[i] += B[i] + scalar * C[i];
+      }
+    }
+    OMP_BARRIER
+    OMP_MASTER
+    nstream_time = prk_wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar = 0.0;
+  double br = 2.0;
+  double cr = 2.0;
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum = 0.0;
+  OMP_PARALLEL_FOR_REDUCE( +:asum )
+  for (size_t i=0; i<length; i++) {
+      asum += fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (fabs(ar-asum)/asum > epsilon) {
+      printf("Failed Validation on output array\n"
+             "       Expected checksum: %lf\n"
+             "       Observed checksum: %lf\n"
+             "ERROR: solution did not validate\n", ar, asum);
+      return 1;
+  } else {
+      printf("Solution validates\n");
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
+  }
+
+  return 0;
+}
+
+
diff --git a/C1z/prk_util.h b/C1z/prk_util.h
index 5d0831d34..24e428552 100644
--- a/C1z/prk_util.h
+++ b/C1z/prk_util.h
@@ -38,13 +38,11 @@
 
 #define PRAGMA(x) _Pragma(#x)
 
-// All of this is to get posix_memalign defined...
-// #define _POSIX_C_SOURCE (200112L)
-#define _POSIX_C_SOURCE (200809L)
-#define _XOPEN_SOURCE 600
-
 #include <stdio.h>   // atoi
 #include <stdlib.h>  // getenv
+
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
 #include <stdint.h>
 #if defined(__PGIC__)
 typedef _Bool bool;
@@ -80,7 +78,7 @@ const bool false=0;
 # define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) )
 # if (_OPENMP >= 201300)
 #  define OMP_SIMD PRAGMA(omp simd)
-#  define OMP_FOR_SIMD() PRAGMA(omp for simd x)
+#  define OMP_FOR_SIMD(x) PRAGMA(omp for simd x)
 #  define OMP_TASK(x) PRAGMA(omp task x)
 #  define OMP_TASKLOOP(x) PRAGMA(omp taskloop x )
 #  define OMP_TASKWAIT PRAGMA(omp taskwait)
@@ -89,7 +87,7 @@ const bool false=0;
 # else
 #  warning No OpenMP 4+ features!
 #  define OMP_SIMD
-#  define OMP_FOR_SIMD() PRAGMA(omp for x)
+#  define OMP_FOR_SIMD(x) PRAGMA(omp for x)
 #  define OMP_TASK(x)
 #  define OMP_TASKLOOP(x)
 #  define OMP_TASKWAIT
@@ -105,7 +103,7 @@ const bool false=0;
 # define OMP_FOR(x)
 # define OMP_FOR_REDUCE(x)
 # define OMP_SIMD
-# define OMP_FOR_SIMD()
+# define OMP_FOR_SIMD(x)
 # define OMP_TASK(x)
 # define OMP_TASKLOOP(x)
 # define OMP_TASKWAIT
@@ -298,4 +296,52 @@ static inline void prk_free(void * p)
 #endif
 }
 
+static inline void prk_lookup_posix_error(int e, char * n, int l)
+{
+    switch (e) {
+        case EACCES:
+            strncpy(n,"EACCES",l);
+            break;
+        case EAGAIN:
+            strncpy(n,"EAGAIN",l);
+            break;
+        case EBADF:
+            strncpy(n,"EBADF",l);
+            break;
+        case EEXIST:
+            strncpy(n,"EEXIST",l);
+            break;
+        case EINVAL:
+            strncpy(n,"EINVAL",l);
+            break;
+        case ENFILE:
+            strncpy(n,"ENFILE",l);
+            break;
+        case ENODEV:
+            strncpy(n,"ENODEV",l);
+            break;
+        case ENOMEM:
+            strncpy(n,"ENOMEM",l);
+            break;
+        case EPERM:
+            strncpy(n,"EPERM",l);
+            break;
+        case ETXTBSY:
+            strncpy(n,"ETXTBSY",l);
+            break;
+        case EOPNOTSUPP:
+            strncpy(n,"EOPNOTSUPP",l);
+            break;
+        /*
+        case E:
+            strncpy(n,"E",l);
+            break;
+        */
+        default:
+            printf("error code %d unknown\n", e);
+            strncpy(n,"UNKNOWN",l);
+            break;
+    }
+}
+
 #endif /* PRK_UTIL_H */
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index e1d68dd5a..0c30fea7f 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -115,3 +115,5 @@ COARRAYFLAG=-fcoarray=single -lcaf_single
 # multi-node
 # COARRAYFLAG=-fcoarray=lib -lcaf_mpi
 
+MEMKINDDIR=/home/parallels/PRK/deps
+MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib
diff --git a/common/make.defs.intel b/common/make.defs.intel
index bba53d1bb..17a4c2833 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -116,3 +116,5 @@ COARRAYFLAG=-coarray
 # multi-node
 # COARRAYFLAG=-coarray=distributed
 
+MEMKINDDIR=/home/parallels/PRK/deps
+MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index a5c9010d4..c8aa874ea 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -129,3 +129,6 @@ ISPCFLAG=-O3 --target=host --opt=fast-math
 #
 # We assume you have installed an implementation of MPI-3 that is in your path.
 MPICC=mpicc
+
+MEMKINDDIR=/home/parallels/PRK/deps
+MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib

From 75bc803dacb1edb3330baaf4826be83fed2411cc Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 2 Mar 2019 21:09:14 -0800
Subject: [PATCH 121/245] Sycl multi device and exceptions (#347)

* triSYCL needs C++17
* fix Julia syntax issue "1./" is a syntax error now.  change to "1.0/"
* do to SYCL what we have for OpenCL
* fix name mangling issue - thanks Rod@CodePlay!
* run 32b for all devices unconditionally
* label result with precision
* hard-code SYCL to CPU execution only due to GPU issues
the bandwidth reported is consistent for elements, not bytes, which
means that something is wrong.  64b data should not lead to BW that is
2x 32b data...
* add host, catch std exception
* c++1z instead of c++17
* fix use of ranges in SYCL
* correct sycl ranges fix
* better example flags
---
 Cxx11/Makefile          |   2 +-
 Cxx11/nstream-opencl.cc |  12 ++-
 Cxx11/nstream-sycl.cc   | 178 +++++++++++++++++++++++++++-------------
 common/make.defs.gcc    |  22 ++++-
 common/make.defs.llvm   |  18 ++--
 5 files changed, 154 insertions(+), 78 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 54873e41d..65875fba4 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -36,7 +36,6 @@ TARGETFLAGS = $(OFFLOADFLAG)
 OPENCLFLAGS = $(OPENCLFLAG)
 # We do not yet handle all possible exceptions...
 #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS
-SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0
 ORNLACCFLAGS = $(ORNLACCFLAG)
 TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER)
 CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG)
@@ -47,6 +46,7 @@ PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL
 RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA
 THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST
 KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS
+SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 $(RANGEFLAGS)
 ORNLACCFLAGS = $(ORNLACCFLAG)
 
 ifdef OCCADIR
diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc
index 18a5a022c..b0241dd5d 100644
--- a/Cxx11/nstream-opencl.cc
+++ b/Cxx11/nstream-opencl.cc
@@ -141,7 +141,8 @@ void run(cl::Context context, int iterations, size_t length)
       std::cout << "Solution validates" << std::endl;
       double avgtime = nstream_time/iterations;
       double nbytes = 4.0 * length * sizeof(T);
-      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+      std::cout << precision << "B "
+                << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
                 << " Avg time (s): " << avgtime << std::endl;
   }
 }
@@ -200,9 +201,8 @@ int main(int argc, char* argv[])
 
     if (precision==64) {
         run<double>(cpu, iterations, length);
-    } else {
-        run<float>(cpu, iterations, length);
     }
+    run<float>(cpu, iterations, length);
   }
 
   cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err);
@@ -214,9 +214,8 @@ int main(int argc, char* argv[])
 
     if (precision==64) {
         run<double>(gpu, iterations, length);
-    } else {
-        run<float>(gpu, iterations, length);
     }
+    run<float>(gpu, iterations, length);
   }
 
   cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err);
@@ -229,9 +228,8 @@ int main(int argc, char* argv[])
 
     if (precision==64) {
         run<double>(acc, iterations, length);
-    } else {
-        run<float>(acc, iterations, length);
     }
+    run<float>(acc, iterations, length);
   }
 
   return 0;
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 2193d4811..b4c056990 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -66,66 +66,32 @@
 
 #include "prk_util.h"
 
-int main(int argc, char * argv[])
-{
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations, offset;
-  size_t length;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <vector length>";
-      }
-
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      length = std::atol(argv[2]);
-      if (length <= 0) {
-        throw "ERROR: vector length must be positive";
-      }
-
-      offset = (argc>3) ? std::atoi(argv[3]) : 0;
-      if (length <= 0) {
-        throw "ERROR: offset must be nonnegative";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  std::cout << "Number of iterations = " << iterations << std::endl;
-  std::cout << "Vector length        = " << length << std::endl;
-  std::cout << "Offset               = " << offset << std::endl;
-
-  // SYCL device queue
-  cl::sycl::queue q;
+// need to declare kernel class as template
+// to prevent name mangling conflict below
+template <typename T> class nstream;
 
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t length)
+{
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
   double nstream_time(0);
 
-  std::vector<double> h_A(length,0);
-  std::vector<double> h_B(length,2);
-  std::vector<double> h_C(length,2);
+  std::vector<T> h_A(length);
+  std::vector<T> h_B(length);
+  std::vector<T> h_C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
 
-  double const scalar(3);
+  const T scalar(3);
 
-  {
-    // initialize device buffers from host buffers
-    cl::sycl::buffer<double> d_A { h_A.data(), h_A.size() };
-    cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
-    cl::sycl::buffer<double> d_C { h_C.data(), h_C.size() };
+  try {
+
+    cl::sycl::buffer<T> d_A { h_A.data(), h_A.size() };
+    cl::sycl::buffer<T> d_B { h_B.data(), h_B.size() };
+    cl::sycl::buffer<T> d_C { h_C.data(), h_C.size() };
 
     for (int iter = 0; iter<=iterations; ++iter) {
 
@@ -133,12 +99,11 @@ int main(int argc, char * argv[])
 
       q.submit([&](cl::sycl::handler& h) {
 
-        // accessor methods
-        auto A = d_A.get_access<cl::sycl::access::mode::read_write>(h);
-        auto B = d_B.get_access<cl::sycl::access::mode::read>(h);
-        auto C = d_C.get_access<cl::sycl::access::mode::read>(h);
+        auto A = d_A.template get_access<cl::sycl::access::mode::read_write>(h);
+        auto B = d_B.template get_access<cl::sycl::access::mode::read>(h);
+        auto C = d_C.template get_access<cl::sycl::access::mode::read>(h);
 
-        h.parallel_for<class nstream>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+        h.parallel_for<class nstream<T>>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
             A[i] += B[i] + scalar * C[i];
         });
       });
@@ -150,6 +115,10 @@ int main(int argc, char * argv[])
     // for other device-oriented programming models.
     nstream_time = prk::wtime() - nstream_time;
   }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
@@ -175,14 +144,105 @@ int main(int argc, char * argv[])
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
-      return 1;
   } else {
       std::cout << "Solution validates" << std::endl;
       double avgtime = nstream_time/iterations;
-      double nbytes = 4.0 * length * sizeof(double);
-      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+      double nbytes = 4.0 * length * sizeof(T);
+      std::cout << 8*sizeof(T) << "B "
+                << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
                 << " Avg time (s): " << avgtime << std::endl;
   }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+  try {
+
+    if (1) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+        auto device      = host.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+
+        run<float>(host, iterations, length);
+        run<double>(host, iterations, length);
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+        auto device      = cpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+
+        run<float>(cpu, iterations, length);
+        run<double>(cpu, iterations, length);
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (0) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+        auto device      = gpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+
+        run<float>(gpu, iterations, length);
+        run<double>(gpu, iterations, length);
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
 
   return 0;
 }
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 0c30fea7f..5f3f62f03 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -26,7 +26,9 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
 # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
 #
 #DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
-DEFAULT_OPT_FLAGS+=-Wall
+DEFAULT_OPT_FLAGS+=-Wall #-Werror
+DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations
+#DEFAULT_OPT_FLAGS+=-mavx -mfma
 #
 # OpenMP flags
 #
@@ -43,6 +45,7 @@ OPENCLFLAG=-framework OpenCL
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations
+METALFLAG=-framework MetalPerformanceShaders
 #
 # SYCL flags
 #
@@ -67,12 +70,12 @@ SYCLFLAG=-I$(SYCLDIR)/include
 #
 # TBB
 #
-TBBDIR=/usr/local/Cellar/tbb/2018_U3_1
+TBBDIR=/usr/local/Cellar/tbb/2019_U3_1
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.68.0_1/include
 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
@@ -83,6 +86,19 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust
 THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 #
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
+SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG}
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#
 # CBLAS for C++ DGEMM
 #
 CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index c8aa874ea..4929aa0bb 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -4,8 +4,8 @@
 #
 # Base compilers and language options
 #
-LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0
-LLVM_PATH=${LLVM_ROOT}/bin/
+#LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0
+#LLVM_PATH=${LLVM_ROOT}/bin/
 #LLVM_PATH=/opt/llvm/HEAD/bin/
 # C99 is required in some implementations.
 CC=${LLVM_PATH}clang -std=c11 -pthread
@@ -47,13 +47,13 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib
 # OpenCL flags
 #
 # MacOS
-OPENCLFLAG=-framework OpenCL
+#OPENCLFLAG=-framework OpenCL
 # POCL
 # http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct...
 #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL
 # Linux
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
-#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations
 #
 # SYCL flags
 #
@@ -76,8 +76,8 @@ SYCLFLAG+=-std=c++14
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
-SYCLCXX=${CXX} ${OPENMPFLAG}
-SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS)
+SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
+SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -92,13 +92,15 @@ OCCADIR=${HOME}/prk-repo/Cxx11/occa
 #
 TBBDIR=/usr/local/Cellar/tbb/2018_U3_1
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb
+#TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
+#BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
-PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages
 KOKKOSDIR=/opt/kokkos/clang
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/clang

From 3caffa78582bd47ebb47e1eb5bca51b36d4b52e6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 5 Mar 2019 13:24:21 -0800
Subject: [PATCH 122/245] OpenCL: add No Device errors (#373)

* add No Device errors
* errno needs to be included unconditionally
---
 C1z/prk_util.h          | 2 +-
 Cxx11/nstream-opencl.cc | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/C1z/prk_util.h b/C1z/prk_util.h
index 24e428552..313cca471 100644
--- a/C1z/prk_util.h
+++ b/C1z/prk_util.h
@@ -56,6 +56,7 @@ const bool false=0;
 #include <math.h>    // fabs
 #include <time.h>    // clock_gettime, timespec_get
 #include <assert.h>
+#include <errno.h>
 
 #ifndef MIN
 #define MIN(x,y) ((x)<(y)?(x):(y))
@@ -142,7 +143,6 @@ int __cilkrts_get_nworkers(void);
 # include <threads.h>
 #else
 # define HAVE_PTHREADS
-# include <errno.h>
 # include <pthread.h>
 #endif
 
diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc
index b0241dd5d..9d81f1b8d 100644
--- a/Cxx11/nstream-opencl.cc
+++ b/Cxx11/nstream-opencl.cc
@@ -203,6 +203,8 @@ int main(int argc, char* argv[])
         run<double>(cpu, iterations, length);
     }
     run<float>(cpu, iterations, length);
+  } else {
+    std::cerr << "No CPU" << std::endl;
   }
 
   cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err);
@@ -216,6 +218,8 @@ int main(int argc, char* argv[])
         run<double>(gpu, iterations, length);
     }
     run<float>(gpu, iterations, length);
+  } else {
+    std::cerr << "No GPU" << std::endl;
   }
 
   cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err);
@@ -230,6 +234,8 @@ int main(int argc, char* argv[])
         run<double>(acc, iterations, length);
     }
     run<float>(acc, iterations, length);
+  } else {
+    std::cerr << "No ACC" << std::endl;
   }
 
   return 0;

From 8b626e1bd7b14e9c0115a50d6051c314b97656ff Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 2 Mar 2019 21:34:47 -0800
Subject: [PATCH 123/245] fix nstream correctness by initializing host vectors

---
 Cxx11/nstream-sycl.cc | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index b4c056990..2f7e64253 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -79,9 +79,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 
   double nstream_time(0);
 
-  std::vector<T> h_A(length);
-  std::vector<T> h_B(length);
-  std::vector<T> h_C(length);
+  std::vector<T> h_A(length,0);
+  std::vector<T> h_B(length,2);
+  std::vector<T> h_C(length,2);
 
   auto range = prk::range(static_cast<size_t>(0), length);
 
@@ -124,9 +124,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  double ar(0);
-  double br(2);
-  double cr(2);
+  T ar(0);
+  T br(2);
+  T cr(2);
   for (int i=0; i<=iterations; ++i) {
       ar += br + scalar * cr;
   }
@@ -138,7 +138,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
       asum += std::fabs(h_A[i]);
   }
 
-  double epsilon(1.e-8);
+  const double epsilon(1.e-8);
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
                 << "       Expected checksum: " << ar << "\n"
@@ -202,10 +202,12 @@ int main(int argc, char * argv[])
 
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
         auto device      = host.get_device();
-        auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
 
         run<float>(host, iterations, length);
         run<double>(host, iterations, length);
@@ -214,11 +216,13 @@ int main(int argc, char * argv[])
     // CPU requires spir64 target
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
         auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
         //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
 
         run<float>(cpu, iterations, length);
         run<double>(cpu, iterations, length);
@@ -227,11 +231,13 @@ int main(int argc, char * argv[])
     // NVIDIA GPU requires ptx64 target and does not work very well
     if (0) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
         auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
         //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
 
         run<float>(gpu, iterations, length);
         run<double>(gpu, iterations, length);

From 40fa3819140ff90bc23a4058aecf6475ecc24d40 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 2 Mar 2019 21:34:59 -0800
Subject: [PATCH 124/245] make transpose-sycl multi-device etc

---
 Cxx11/transpose-sycl.cc | 180 +++++++++++++++++++++++++++-------------
 1 file changed, 123 insertions(+), 57 deletions(-)

diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 1c8489806..d7b33e866 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -53,66 +53,33 @@
 
 #include "prk_util.h"
 
-int main(int argc, char * argv[])
-{
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations;
-  size_t order;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <matrix order>";
-      }
-
-      // number of times to do the transpose
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      // order of a the matrix
-      order = std::atoi(argv[2]);
-      if (order <= 0) {
-        throw "ERROR: Matrix Order must be greater than 0";
-      } else if (order > std::floor(std::sqrt(INT_MAX))) {
-        throw "ERROR: matrix dimension too large - overflow risk";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
+// need to declare kernel class as template
+// to prevent name mangling conflict below
+template <typename T> class transpose;
 
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t order)
+{
   //////////////////////////////////////////////////////////////////////
   /// Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
   double trans_time(0);
 
-  std::vector<double> h_A(order*order);
-  std::vector<double> h_B(order*order,0.0);
+  std::vector<T> h_A(order*order);
+  std::vector<T> h_B(order*order,static_cast<T>(0));
 
   // fill A with the sequence 0 to order^2-1 as doubles
-  std::iota(h_A.begin(), h_A.end(), 0.0);
+  std::iota(h_A.begin(), h_A.end(), static_cast<T>(0));
+
+  try {
 
-  // SYCL device queue
-  cl::sycl::queue q;
-  {
-    // initialize device buffers from host buffers
 #if USE_2D_INDEXING
     cl::sycl::buffer<double,2> d_A( h_A.data(), cl::sycl::range<2>{order,order} );
     cl::sycl::buffer<double,2> d_B( h_B.data(), cl::sycl::range<2>{order,order} );
 #else
-    cl::sycl::buffer<double> d_A { h_A.data(), h_A.size() };
-    cl::sycl::buffer<double> d_B { h_B.data(), h_B.size() };
+    cl::sycl::buffer<T> d_A { h_A.data(), h_A.size() };
+    cl::sycl::buffer<T> d_B { h_B.data(), h_B.size() };
 #endif
 
     for (int iter = 0; iter<=iterations; ++iter) {
@@ -122,19 +89,19 @@ int main(int argc, char * argv[])
       q.submit([&](cl::sycl::handler& h) {
 
         // accessor methods
-        auto A = d_A.get_access<cl::sycl::access::mode::read_write>(h);
-        auto B = d_B.get_access<cl::sycl::access::mode::read_write>(h);
+        auto A = d_A.template get_access<cl::sycl::access::mode::read_write>(h);
+        auto B = d_B.template get_access<cl::sycl::access::mode::read_write>(h);
 
         // transpose
-        h.parallel_for<class transpose>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+        h.parallel_for<class transpose<T>>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
 #if USE_2D_INDEXING
           cl::sycl::id<2> ij{it[0],it[1]};
           cl::sycl::id<2> ji{it[1],it[0]};
           B[ij] += A[ji];
-          A[ji] += 1.0;
+          A[ji] += static_cast<T>(1);
 #else
           B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
-          A[it[1] * order + it[0]] += 1.0;
+          A[it[1] * order + it[0]] += static_cast<T>(1);
 #endif
         });
       });
@@ -146,19 +113,23 @@ int main(int argc, char * argv[])
     // for other device-oriented programming models.
     trans_time = prk::wtime() - trans_time;
   }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
   // TODO: replace with std::generate, std::accumulate, or similar
-  double const addit = (iterations+1.) * (iterations/2.);
+  const T addit = (iterations+1.) * (iterations/2.);
   double abserr(0);
   for (size_t i=0; i<order; ++i) {
     for (size_t j=0; j<order; ++j) {
       size_t const ij = i*order+j;
       size_t const ji = j*order+i;
-      double const reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      const T reference = static_cast<T>(ij)*(1.+iterations)+addit;
       abserr += std::fabs(h_B[ji] - reference);
     }
   }
@@ -167,19 +138,114 @@ int main(int argc, char * argv[])
   std::cout << "Sum of absolute differences: " << abserr << std::endl;
 #endif
 
-  double const epsilon(1.0e-8);
+  const double epsilon(1.0e-8);
   if (abserr < epsilon) {
     std::cout << "Solution validates" << std::endl;
-    auto avgtime = trans_time/iterations;
-    auto bytes = (size_t)order * (size_t)order * sizeof(double);
-    std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
+    double avgtime = trans_time/iterations;
+    double bytes = (size_t)order * (size_t)order * sizeof(T);
+    std::cout << 8*sizeof(T) << "B "
+              << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "ERROR: Aggregate squared error " << abserr
               << " exceeds threshold " << epsilon << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
     return 1;
   }
 
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+  try {
+
+    if (1) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
+
+        run<float>(host, iterations, order);
+        run<double>(host, iterations, order);
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
+        auto device      = cpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(cpu, iterations, order);
+        run<double>(cpu, iterations, order);
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (0) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
+        auto device      = gpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(gpu, iterations, order);
+        run<double>(gpu, iterations, order);
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+
   return 0;
 }
 

From f9425e9c2719e80172d56290c8da0de09154a146 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 2 Mar 2019 21:42:45 -0800
Subject: [PATCH 125/245] templatize stencil sycl kernel over type

---
 Cxx11/generate-sycl-stencil.py |  33 ++--
 Cxx11/stencil_sycl.hpp         | 270 +++++++++++++++++----------------
 2 files changed, 157 insertions(+), 146 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index 1c71ff03c..6f797dac7 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -6,13 +6,14 @@
 import os
 
 def codegen(src,pattern,stencil_size,radius,model,dim):
+    src.write('template <typename T>\n')
     src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ')
     if (dim==2):
-        src.write('cl::sycl::buffer<double, 2> & d_in, ')
-        src.write('cl::sycl::buffer<double, 2> & d_out)\n')
+        src.write('cl::sycl::buffer<T, 2> & d_in, ')
+        src.write('cl::sycl::buffer<T, 2> & d_out)\n')
     else:
-        src.write('cl::sycl::buffer<double> & d_in, ')
-        src.write('cl::sycl::buffer<double> & d_out)\n')
+        src.write('cl::sycl::buffer<T> & d_in, ')
+        src.write('cl::sycl::buffer<T> & d_out)\n')
     src.write('{\n')
     src.write('  q.submit([&](cl::sycl::handler& h) {\n')
     src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);\n')
@@ -41,36 +42,36 @@ def codegen(src,pattern,stencil_size,radius,model,dim):
                 if i > 1:
                     src.write('\n')
                     src.write(19*' ')
-                src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius)))
+                src.write('+in[xy+dx'+str(i)+'] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 src.write('\n'+19*' ')
-                src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius)))
+                src.write('+in[xy-dx'+str(i)+'] * static_cast<T>('+str(-1./(2.*i*radius))+')')
                 src.write('\n'+19*' ')
-                src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius)))
+                src.write('+in[xy+dy'+str(i)+'] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 src.write('\n'+19*' ')
-                src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius)))
+                src.write('+in[xy-dy'+str(i)+'] * static_cast<T>('+str(-1./(2.*i*radius))+')')
             else:
                 # 1D indexing the slow way
                 #if i > 1:
                 #    src.write('\n')
                 #    src.write(22*' ')
-                #src.write('+in[i*n+(j+'+str(i)+')] * '+str(+1./(2.*i*radius)))
+                #src.write('+in[i*n+(j+'+str(i)+')] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 #src.write('\n'+22*' ')
-                #src.write('+in[i*n+(j-'+str(i)+')] * '+str(-1./(2.*i*radius)))
+                #src.write('+in[i*n+(j-'+str(i)+')] * static_cast<T>('+str(-1./(2.*i*radius))+')')
                 #src.write('\n'+22*' ')
-                #src.write('+in[(i+'+str(i)+')*n+j] * '+str(+1./(2.*i*radius)))
+                #src.write('+in[(i+'+str(i)+')*n+j] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 #src.write('\n'+22*' ')
-                #src.write('+in[(i-'+str(i)+')*n+j] * '+str(-1./(2.*i*radius)))
+                #src.write('+in[(i-'+str(i)+')*n+j] * static_cast<T>('+str(-1./(2.*i*radius))+')')
                 # 1D indexing the fast way
                 if i > 1:
                     src.write('\n')
                     src.write(30*' ')
-                src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * '+str(+1./(2.*i*radius)))
+                src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 src.write('\n'+30*' ')
-                src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * '+str(-1./(2.*i*radius)))
+                src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * static_cast<T>('+str(-1./(2.*i*radius))+')')
                 src.write('\n'+30*' ')
-                src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * '+str(+1./(2.*i*radius)))
+                src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 src.write('\n'+30*' ')
-                src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * '+str(-1./(2.*i*radius)))
+                src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * static_cast<T>('+str(-1./(2.*i*radius))+')')
             if i == radius:
                 src.write(';\n')
     else:
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 6fbf8d9f7..c8a9d0a5b 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -1,18 +1,20 @@
-void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+template <typename T>
+void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star1_1d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5
-                              +in[it[0]*n+(it[1]-1)] * -0.5
-                              +in[(it[0]+1)*n+it[1]] * 0.5
-                              +in[(it[0]-1)*n+it[1]] * -0.5;
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.5)
+                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.5)
+                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.5)
+                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.5);
     });
   });
 }
 
-void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+template <typename T>
+void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
@@ -21,33 +23,35 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     h.parallel_for<class star1_2d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * 0.5
-                   +in[xy-dx1] * -0.5
-                   +in[xy+dy1] * 0.5
-                   +in[xy-dy1] * -0.5;
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.5)
+                   +in[xy-dx1] * static_cast<T>(-0.5)
+                   +in[xy+dy1] * static_cast<T>(0.5)
+                   +in[xy-dy1] * static_cast<T>(-0.5);
     });
   });
 }
 
-void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+template <typename T>
+void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star2_1d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25
-                              +in[it[0]*n+(it[1]-1)] * -0.25
-                              +in[(it[0]+1)*n+it[1]] * 0.25
-                              +in[(it[0]-1)*n+it[1]] * -0.25
-                              +in[it[0]*n+(it[1]+2)] * 0.125
-                              +in[it[0]*n+(it[1]-2)] * -0.125
-                              +in[(it[0]+2)*n+it[1]] * 0.125
-                              +in[(it[0]-2)*n+it[1]] * -0.125;
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.25)
+                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.25)
+                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.25)
+                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.25)
+                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.125)
+                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.125)
+                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.125)
+                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.125);
     });
   });
 }
 
-void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+template <typename T>
+void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
@@ -58,41 +62,43 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
     h.parallel_for<class star2_2d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * 0.25
-                   +in[xy-dx1] * -0.25
-                   +in[xy+dy1] * 0.25
-                   +in[xy-dy1] * -0.25
-                   +in[xy+dx2] * 0.125
-                   +in[xy-dx2] * -0.125
-                   +in[xy+dy2] * 0.125
-                   +in[xy-dy2] * -0.125;
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.25)
+                   +in[xy-dx1] * static_cast<T>(-0.25)
+                   +in[xy+dy1] * static_cast<T>(0.25)
+                   +in[xy-dy1] * static_cast<T>(-0.25)
+                   +in[xy+dx2] * static_cast<T>(0.125)
+                   +in[xy-dx2] * static_cast<T>(-0.125)
+                   +in[xy+dy2] * static_cast<T>(0.125)
+                   +in[xy-dy2] * static_cast<T>(-0.125);
     });
   });
 }
 
-void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+template <typename T>
+void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star3_1d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.16666666666666666
-                              +in[it[0]*n+(it[1]-1)] * -0.16666666666666666
-                              +in[(it[0]+1)*n+it[1]] * 0.16666666666666666
-                              +in[(it[0]-1)*n+it[1]] * -0.16666666666666666
-                              +in[it[0]*n+(it[1]+2)] * 0.08333333333333333
-                              +in[it[0]*n+(it[1]-2)] * -0.08333333333333333
-                              +in[(it[0]+2)*n+it[1]] * 0.08333333333333333
-                              +in[(it[0]-2)*n+it[1]] * -0.08333333333333333
-                              +in[it[0]*n+(it[1]+3)] * 0.05555555555555555
-                              +in[it[0]*n+(it[1]-3)] * -0.05555555555555555
-                              +in[(it[0]+3)*n+it[1]] * 0.05555555555555555
-                              +in[(it[0]-3)*n+it[1]] * -0.05555555555555555;
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.166666666667)
+                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.166666666667)
+                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.166666666667)
+                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.166666666667)
+                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.0833333333333)
+                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.0833333333333)
+                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.0833333333333)
+                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.0833333333333)
+                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.0555555555556)
+                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.0555555555556)
+                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.0555555555556)
+                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.0555555555556);
     });
   });
 }
 
-void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+template <typename T>
+void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
@@ -105,49 +111,51 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
     h.parallel_for<class star3_2d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * 0.16666666666666666
-                   +in[xy-dx1] * -0.16666666666666666
-                   +in[xy+dy1] * 0.16666666666666666
-                   +in[xy-dy1] * -0.16666666666666666
-                   +in[xy+dx2] * 0.08333333333333333
-                   +in[xy-dx2] * -0.08333333333333333
-                   +in[xy+dy2] * 0.08333333333333333
-                   +in[xy-dy2] * -0.08333333333333333
-                   +in[xy+dx3] * 0.05555555555555555
-                   +in[xy-dx3] * -0.05555555555555555
-                   +in[xy+dy3] * 0.05555555555555555
-                   +in[xy-dy3] * -0.05555555555555555;
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.166666666667)
+                   +in[xy-dx1] * static_cast<T>(-0.166666666667)
+                   +in[xy+dy1] * static_cast<T>(0.166666666667)
+                   +in[xy-dy1] * static_cast<T>(-0.166666666667)
+                   +in[xy+dx2] * static_cast<T>(0.0833333333333)
+                   +in[xy-dx2] * static_cast<T>(-0.0833333333333)
+                   +in[xy+dy2] * static_cast<T>(0.0833333333333)
+                   +in[xy-dy2] * static_cast<T>(-0.0833333333333)
+                   +in[xy+dx3] * static_cast<T>(0.0555555555556)
+                   +in[xy-dx3] * static_cast<T>(-0.0555555555556)
+                   +in[xy+dy3] * static_cast<T>(0.0555555555556)
+                   +in[xy-dy3] * static_cast<T>(-0.0555555555556);
     });
   });
 }
 
-void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+template <typename T>
+void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star4_1d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125
-                              +in[it[0]*n+(it[1]-1)] * -0.125
-                              +in[(it[0]+1)*n+it[1]] * 0.125
-                              +in[(it[0]-1)*n+it[1]] * -0.125
-                              +in[it[0]*n+(it[1]+2)] * 0.0625
-                              +in[it[0]*n+(it[1]-2)] * -0.0625
-                              +in[(it[0]+2)*n+it[1]] * 0.0625
-                              +in[(it[0]-2)*n+it[1]] * -0.0625
-                              +in[it[0]*n+(it[1]+3)] * 0.041666666666666664
-                              +in[it[0]*n+(it[1]-3)] * -0.041666666666666664
-                              +in[(it[0]+3)*n+it[1]] * 0.041666666666666664
-                              +in[(it[0]-3)*n+it[1]] * -0.041666666666666664
-                              +in[it[0]*n+(it[1]+4)] * 0.03125
-                              +in[it[0]*n+(it[1]-4)] * -0.03125
-                              +in[(it[0]+4)*n+it[1]] * 0.03125
-                              +in[(it[0]-4)*n+it[1]] * -0.03125;
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.125)
+                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.125)
+                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.125)
+                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.125)
+                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.0625)
+                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.0625)
+                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.0625)
+                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.0625)
+                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.0416666666667)
+                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.0416666666667)
+                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.0416666666667)
+                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.0416666666667)
+                              +in[it[0]*n+(it[1]+4)] * static_cast<T>(0.03125)
+                              +in[it[0]*n+(it[1]-4)] * static_cast<T>(-0.03125)
+                              +in[(it[0]+4)*n+it[1]] * static_cast<T>(0.03125)
+                              +in[(it[0]-4)*n+it[1]] * static_cast<T>(-0.03125);
     });
   });
 }
 
-void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+template <typename T>
+void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
@@ -162,57 +170,59 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
     h.parallel_for<class star4_2d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * 0.125
-                   +in[xy-dx1] * -0.125
-                   +in[xy+dy1] * 0.125
-                   +in[xy-dy1] * -0.125
-                   +in[xy+dx2] * 0.0625
-                   +in[xy-dx2] * -0.0625
-                   +in[xy+dy2] * 0.0625
-                   +in[xy-dy2] * -0.0625
-                   +in[xy+dx3] * 0.041666666666666664
-                   +in[xy-dx3] * -0.041666666666666664
-                   +in[xy+dy3] * 0.041666666666666664
-                   +in[xy-dy3] * -0.041666666666666664
-                   +in[xy+dx4] * 0.03125
-                   +in[xy-dx4] * -0.03125
-                   +in[xy+dy4] * 0.03125
-                   +in[xy-dy4] * -0.03125;
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.125)
+                   +in[xy-dx1] * static_cast<T>(-0.125)
+                   +in[xy+dy1] * static_cast<T>(0.125)
+                   +in[xy-dy1] * static_cast<T>(-0.125)
+                   +in[xy+dx2] * static_cast<T>(0.0625)
+                   +in[xy-dx2] * static_cast<T>(-0.0625)
+                   +in[xy+dy2] * static_cast<T>(0.0625)
+                   +in[xy-dy2] * static_cast<T>(-0.0625)
+                   +in[xy+dx3] * static_cast<T>(0.0416666666667)
+                   +in[xy-dx3] * static_cast<T>(-0.0416666666667)
+                   +in[xy+dy3] * static_cast<T>(0.0416666666667)
+                   +in[xy-dy3] * static_cast<T>(-0.0416666666667)
+                   +in[xy+dx4] * static_cast<T>(0.03125)
+                   +in[xy-dx4] * static_cast<T>(-0.03125)
+                   +in[xy+dy4] * static_cast<T>(0.03125)
+                   +in[xy-dy4] * static_cast<T>(-0.03125);
     });
   });
 }
 
-void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+template <typename T>
+void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star5_1d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1
-                              +in[it[0]*n+(it[1]-1)] * -0.1
-                              +in[(it[0]+1)*n+it[1]] * 0.1
-                              +in[(it[0]-1)*n+it[1]] * -0.1
-                              +in[it[0]*n+(it[1]+2)] * 0.05
-                              +in[it[0]*n+(it[1]-2)] * -0.05
-                              +in[(it[0]+2)*n+it[1]] * 0.05
-                              +in[(it[0]-2)*n+it[1]] * -0.05
-                              +in[it[0]*n+(it[1]+3)] * 0.03333333333333333
-                              +in[it[0]*n+(it[1]-3)] * -0.03333333333333333
-                              +in[(it[0]+3)*n+it[1]] * 0.03333333333333333
-                              +in[(it[0]-3)*n+it[1]] * -0.03333333333333333
-                              +in[it[0]*n+(it[1]+4)] * 0.025
-                              +in[it[0]*n+(it[1]-4)] * -0.025
-                              +in[(it[0]+4)*n+it[1]] * 0.025
-                              +in[(it[0]-4)*n+it[1]] * -0.025
-                              +in[it[0]*n+(it[1]+5)] * 0.02
-                              +in[it[0]*n+(it[1]-5)] * -0.02
-                              +in[(it[0]+5)*n+it[1]] * 0.02
-                              +in[(it[0]-5)*n+it[1]] * -0.02;
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.1)
+                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.1)
+                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.1)
+                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.1)
+                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.05)
+                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.05)
+                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.05)
+                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.05)
+                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.0333333333333)
+                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.0333333333333)
+                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.0333333333333)
+                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.0333333333333)
+                              +in[it[0]*n+(it[1]+4)] * static_cast<T>(0.025)
+                              +in[it[0]*n+(it[1]-4)] * static_cast<T>(-0.025)
+                              +in[(it[0]+4)*n+it[1]] * static_cast<T>(0.025)
+                              +in[(it[0]-4)*n+it[1]] * static_cast<T>(-0.025)
+                              +in[it[0]*n+(it[1]+5)] * static_cast<T>(0.02)
+                              +in[it[0]*n+(it[1]-5)] * static_cast<T>(-0.02)
+                              +in[(it[0]+5)*n+it[1]] * static_cast<T>(0.02)
+                              +in[(it[0]-5)*n+it[1]] * static_cast<T>(-0.02);
     });
   });
 }
 
-void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+template <typename T>
+void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
@@ -229,26 +239,26 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_
     cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
     h.parallel_for<class star5_2d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * 0.1
-                   +in[xy-dx1] * -0.1
-                   +in[xy+dy1] * 0.1
-                   +in[xy-dy1] * -0.1
-                   +in[xy+dx2] * 0.05
-                   +in[xy-dx2] * -0.05
-                   +in[xy+dy2] * 0.05
-                   +in[xy-dy2] * -0.05
-                   +in[xy+dx3] * 0.03333333333333333
-                   +in[xy-dx3] * -0.03333333333333333
-                   +in[xy+dy3] * 0.03333333333333333
-                   +in[xy-dy3] * -0.03333333333333333
-                   +in[xy+dx4] * 0.025
-                   +in[xy-dx4] * -0.025
-                   +in[xy+dy4] * 0.025
-                   +in[xy-dy4] * -0.025
-                   +in[xy+dx5] * 0.02
-                   +in[xy-dx5] * -0.02
-                   +in[xy+dy5] * 0.02
-                   +in[xy-dy5] * -0.02;
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.1)
+                   +in[xy-dx1] * static_cast<T>(-0.1)
+                   +in[xy+dy1] * static_cast<T>(0.1)
+                   +in[xy-dy1] * static_cast<T>(-0.1)
+                   +in[xy+dx2] * static_cast<T>(0.05)
+                   +in[xy-dx2] * static_cast<T>(-0.05)
+                   +in[xy+dy2] * static_cast<T>(0.05)
+                   +in[xy-dy2] * static_cast<T>(-0.05)
+                   +in[xy+dx3] * static_cast<T>(0.0333333333333)
+                   +in[xy-dx3] * static_cast<T>(-0.0333333333333)
+                   +in[xy+dy3] * static_cast<T>(0.0333333333333)
+                   +in[xy-dy3] * static_cast<T>(-0.0333333333333)
+                   +in[xy+dx4] * static_cast<T>(0.025)
+                   +in[xy-dx4] * static_cast<T>(-0.025)
+                   +in[xy+dy4] * static_cast<T>(0.025)
+                   +in[xy-dy4] * static_cast<T>(-0.025)
+                   +in[xy+dx5] * static_cast<T>(0.02)
+                   +in[xy-dx5] * static_cast<T>(-0.02)
+                   +in[xy+dy5] * static_cast<T>(0.02)
+                   +in[xy-dy5] * static_cast<T>(-0.02);
     });
   });
 }

From 19d457cf04c17a485897c306aa79db20a708520f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 2 Mar 2019 21:59:31 -0800
Subject: [PATCH 126/245] SYCL stencil now templated

---
 Cxx11/generate-sycl-stencil.py |   4 +-
 Cxx11/nstream-sycl.cc          |   4 +
 Cxx11/stencil-sycl.cc          | 267 +++++++++++++++++++++------------
 Cxx11/stencil_sycl.hpp         |  40 ++---
 Cxx11/transpose-sycl.cc        |   4 +
 5 files changed, 201 insertions(+), 118 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index 6f797dac7..e1857e8c2 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -16,8 +16,8 @@ def codegen(src,pattern,stencil_size,radius,model,dim):
         src.write('cl::sycl::buffer<T> & d_out)\n')
     src.write('{\n')
     src.write('  q.submit([&](cl::sycl::handler& h) {\n')
-    src.write('    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);\n')
-    src.write('    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);\n')
+    src.write('    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);\n')
+    src.write('    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);\n')
     if (dim==2):
         for r in range(1,radius+1):
             src.write('    cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 2f7e64253..bb94391ad 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -119,6 +119,10 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
     std::cout << e.what() << std::endl;
     return;
   }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index e42eaef50..585fe62e9 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -65,10 +65,15 @@
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
+template <typename T> class init;
+template <typename T> class add;
+
 #if USE_2D_INDEXING
-void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double, 2> & d_in, cl::sycl::buffer<double, 2> & d_out)
+template <typename T>
+void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 #else
-void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_in, cl::sycl::buffer<double> & d_out)
+template <typename T>
+void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 #endif
 {
     std::cout << "You are trying to use a stencil that does not exist.\n";
@@ -77,74 +82,10 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<double> & d_i
     std::abort();
 }
 
-int main(int argc, char* argv[])
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius)
 {
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  // Process and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations;
-  size_t n, tile_size;
-  bool star = true;
-  size_t radius = 2;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <array dimension> [<tile size> <star/grid> <stencil radius>]";
-      }
-
-      // number of times to run the algorithm
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      // linear grid dimension
-      n  = std::atoi(argv[2]);
-      if (n < 1) {
-        throw "ERROR: grid dimension must be positive";
-      } else if (n > std::floor(std::sqrt(INT_MAX))) {
-        throw "ERROR: grid dimension too large - overflow risk";
-      }
-
-      // default tile size for tiling of local transpose
-      tile_size = 32;
-      if (argc > 3) {
-          tile_size = std::atoi(argv[3]);
-          if (tile_size <= 0) tile_size = n;
-          if (tile_size > n) tile_size = n;
-      }
-
-      // stencil pattern
-      if (argc > 4) {
-          auto stencil = std::string(argv[4]);
-          auto grid = std::string("grid");
-          star = (stencil == grid) ? false : true;
-      }
-
-      // stencil radius
-      radius = 2;
-      if (argc > 5) {
-          radius = std::atoi(argv[5]);
-      }
-
-      if ( (radius < 1) || (2*radius+1 > n) ) {
-        throw "ERROR: Stencil radius negative or too large";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  std::cout << "Number of iterations = " << iterations << std::endl;
-  std::cout << "Grid size            = " << n << std::endl;
-  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
-  std::cout << "Radius of stencil    = " << radius << std::endl;
-
-  auto stencil = nothing;
+  auto stencil = nothing<T>;
   if (star) {
       switch (radius) {
           case 1: stencil = star1; break;
@@ -170,40 +111,39 @@ int main(int argc, char* argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  auto stencil_time = 0.0;
+  double stencil_time(0);
 
-  std::vector<double> h_in(n*n,0.0);
-  std::vector<double> h_out(n*n,0.0);
+  std::vector<T> h_in(n*n,0);
+  std::vector<T> h_out(n*n,0);
+
+  try {
 
-  // SYCL device queue
-  cl::sycl::queue q;
-  {
     // initialize device buffers from host buffers
 #if USE_2D_INDEXING
-    cl::sycl::buffer<double, 2> d_in  { cl::sycl::range<2> {n, n} };
-    cl::sycl::buffer<double, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
+    cl::sycl::buffer<T, 2> d_in  { cl::sycl::range<2> {n, n} };
+    cl::sycl::buffer<T, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
 #else
     // FIXME: if I don't initialize this buffer from host, the results are wrong.  Why?
-    //cl::sycl::buffer<double> d_in  { cl::sycl::range<1> {n*n} };
-    cl::sycl::buffer<double> d_in  { h_in.data(),  h_in.size() };
-    cl::sycl::buffer<double> d_out { h_out.data(), h_out.size() };
+    //cl::sycl::buffer<T> d_in  { cl::sycl::range<1> {n*n} };
+    cl::sycl::buffer<T> d_in  { h_in.data(),  h_in.size() };
+    cl::sycl::buffer<T> d_out { h_out.data(), h_out.size() };
 #endif
 
     q.submit([&](cl::sycl::handler& h) {
 
       // accessor methods
-      auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
+      auto in  = d_in.template get_access<cl::sycl::access::mode::read_write>(h);
 
-      h.parallel_for<class init>(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) {
+      h.parallel_for<class init<T>>(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) {
 #if USE_2D_INDEXING
           cl::sycl::id<2> xy = it.get_id();
           auto i = it[0];
           auto j = it[1];
-          in[xy] = static_cast<double>(i+j);
+          in[xy] = static_cast<T>(i+j);
 #else
           auto i = it[0];
           auto j = it[1];
-          in[i*n+j] = static_cast<double>(i+j);
+          in[i*n+j] = static_cast<T>(i+j);
 #endif
       });
     });
@@ -214,27 +154,28 @@ int main(int argc, char* argv[])
       if (iter==1) stencil_time = prk::wtime();
 
       stencil(q, n, d_in, d_out);
-      // This is only necessary with triSYCL
+#ifdef TRISYCL
       q.wait();
+#endif
 
       q.submit([&](cl::sycl::handler& h) {
 
         // accessor methods
-        auto in  = d_in.get_access<cl::sycl::access::mode::read_write>(h);
+        auto in  = d_in.template get_access<cl::sycl::access::mode::read_write>(h);
 
         // Add constant to solution to force refresh of neighbor data, if any
-        h.parallel_for<class add>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0},
+        h.parallel_for<class add<T>>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0},
                                   [=] (cl::sycl::item<2> it) {
 #if USE_2D_INDEXING
             cl::sycl::id<2> xy = it.get_id();
-            in[xy] += 1.0;
+            in[xy] += static_cast<T>(1);
 #else
 #if 0 // This is noticeably slower :-(
             auto i = it[0];
             auto j = it[1];
             in[i*n+j] += 1.0;
 #else
-            in[it[0]*n+it[1]] += 1.0;
+            in[it[0]*n+it[1]] += static_cast<T>(1);
 #endif
 #endif
         });
@@ -243,6 +184,14 @@ int main(int argc, char* argv[])
     }
     stencil_time = prk::wtime() - stencil_time;
   }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
 
 #if 0
   for (auto i=0; i<n; i++) {
@@ -260,9 +209,9 @@ int main(int argc, char* argv[])
   auto active_points = (n-2L*radius)*(n-2L*radius);
 
   // compute L1 norm in parallel
-  double norm = 0.0;
-  for (auto i=radius; i<n-radius; i++) {
-    for (auto j=radius; j<n-radius; j++) {
+  double norm(0);
+  for (int i=radius; i<n-radius; i++) {
+    for (int j=radius; j<n-radius; j++) {
       norm += std::fabs(h_out[i*n+j]);
     }
   }
@@ -270,11 +219,10 @@ int main(int argc, char* argv[])
 
   // verify correctness
   const double epsilon = 1.0e-8;
-  double reference_norm = 2.*(iterations+1.);
+  const double reference_norm = 2*(iterations+1);
   if (std::fabs(norm-reference_norm) > epsilon) {
     std::cout << "ERROR: L1 norm = " << norm
               << " Reference L1 norm = " << reference_norm << std::endl;
-    return 1;
   } else {
     std::cout << "Solution validates" << std::endl;
 #ifdef VERBOSE
@@ -283,10 +231,137 @@ int main(int argc, char* argv[])
 #endif
     const size_t stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
     size_t flops = (2L*stencil_size+1L) * active_points;
-    auto avgtime = stencil_time/iterations;
-    std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+    double avgtime = stencil_time/iterations;
+    std::cout << 8*sizeof(T) << "B "
+              << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t n, tile_size;
+  bool star = true;
+  size_t radius = 2;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile size> <star/grid> <stencil radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n  = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+  std::cout << "Radius of stencil    = " << radius << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+  try {
+
+    if (1) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
+
+        run<float>(host, iterations, n, tile_size, star, radius);
+        run<double>(host, iterations, n, tile_size, star, radius);
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
+        auto device      = cpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(cpu, iterations, n, tile_size, star, radius);
+        run<double>(cpu, iterations, n, tile_size, star, radius);
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (0) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
+        auto device      = gpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(gpu, iterations, n, tile_size, star, radius);
+        run<double>(gpu, iterations, n, tile_size, star, radius);
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
 
   return 0;
 }
+
+
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index c8a9d0a5b..587f2adc7 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -2,8 +2,8 @@ template <typename T>
 void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star1_1d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.5)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.5)
@@ -17,8 +17,8 @@ template <typename T>
 void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     h.parallel_for<class star1_2d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
@@ -35,8 +35,8 @@ template <typename T>
 void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star2_1d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.25)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.25)
@@ -54,8 +54,8 @@ template <typename T>
 void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
@@ -78,8 +78,8 @@ template <typename T>
 void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star3_1d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.166666666667)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.166666666667)
@@ -101,8 +101,8 @@ template <typename T>
 void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
@@ -131,8 +131,8 @@ template <typename T>
 void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star4_1d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.125)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.125)
@@ -158,8 +158,8 @@ template <typename T>
 void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
@@ -194,8 +194,8 @@ template <typename T>
 void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     h.parallel_for<class star5_1d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.1)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.1)
@@ -225,8 +225,8 @@ template <typename T>
 void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.get_access<cl::sycl::access::mode::read_write>(h);
+    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index d7b33e866..97bb8a09c 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -117,6 +117,10 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
     std::cout << e.what() << std::endl;
     return;
   }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results

From 01ad66d53f7453d5b484db337c1d298714d51c0f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 3 Mar 2019 11:34:55 -0800
Subject: [PATCH 127/245] try to detect working configs better

---
 Cxx11/nstream-sycl.cc | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index bb94391ad..bebfb5932 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -225,26 +225,39 @@ int main(int argc, char * argv[])
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
 #endif
-
-        run<float>(cpu, iterations, length);
-        run<double>(cpu, iterations, length);
+        if (has_spir) {
+          run<float>(cpu, iterations, length);
+          run<double>(cpu, iterations, length);
+        }
     }
 
     // NVIDIA GPU requires ptx64 target and does not work very well
-    if (0) {
+    if (1) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
 #ifndef TRISYCL
         auto device      = gpu.get_device();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
 #endif
-
-        run<float>(gpu, iterations, length);
-        run<double>(gpu, iterations, length);
+        if (has_spir) {
+          run<float>(gpu, iterations, length);
+          run<double>(gpu, iterations, length);
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, length);
+          run<double>(gpu, iterations, length);
+#endif
+        }
     }
   }
   catch (cl::sycl::exception e) {

From 548fe7aecfb259197db4491fc77b07b6c1cc0749 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 4 Mar 2019 11:48:27 -0800
Subject: [PATCH 128/245] forward-declare kernel names in SYCL stencil

---
 Cxx11/generate-sycl-stencil.py |   7 ++-
 Cxx11/stencil_sycl.hpp         | 111 ++++++++++++++++++---------------
 2 files changed, 67 insertions(+), 51 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index e1857e8c2..f0feb9e3f 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -22,7 +22,7 @@ def codegen(src,pattern,stencil_size,radius,model,dim):
         for r in range(1,radius+1):
             src.write('    cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
             src.write('    cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
-    src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d>(')
+    src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d<T>>(')
     src.write('cl::sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ')
     src.write('cl::sycl::id<2> {'+str(radius)+','+str(radius)+'}, ')
     src.write('[=] (cl::sycl::item<2> it) {\n')
@@ -91,6 +91,11 @@ def instance(src,model,pattern,r):
 def main():
     for model in ['sycl']:
       src = open('stencil_'+model+'.hpp','w')
+      for pattern in ['star']:
+        for r in range(1,6):
+          src.write('template <typename T> class '+pattern+str(r)+'_1d;\n')
+          src.write('template <typename T> class '+pattern+str(r)+'_2d;\n')
+      src.write('\n')
       #for pattern in ['star','grid']:
       for pattern in ['star']:
         for r in range(1,6):
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 587f2adc7..799c86573 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -1,10 +1,21 @@
+template <typename T> class star1_1d;
+template <typename T> class star1_2d;
+template <typename T> class star2_1d;
+template <typename T> class star2_2d;
+template <typename T> class star3_1d;
+template <typename T> class star3_2d;
+template <typename T> class star4_1d;
+template <typename T> class star4_2d;
+template <typename T> class star5_1d;
+template <typename T> class star5_2d;
+
 template <typename T>
 void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star1_1d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star1_1d<T>>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.5)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.5)
                               +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.5)
@@ -21,7 +32,7 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
     auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
     cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    h.parallel_for<class star1_2d>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star1_2d<T>>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.5)
                    +in[xy-dx1] * static_cast<T>(-0.5)
@@ -37,7 +48,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star2_1d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star2_1d<T>>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.25)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.25)
                               +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.25)
@@ -60,7 +71,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
     cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
     cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
     cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-    h.parallel_for<class star2_2d>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star2_2d<T>>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.25)
                    +in[xy-dx1] * static_cast<T>(-0.25)
@@ -80,19 +91,19 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star3_1d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.166666666667)
-                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.166666666667)
-                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.166666666667)
-                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.166666666667)
-                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.0833333333333)
-                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.0833333333333)
-                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.0833333333333)
-                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.0833333333333)
-                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.0555555555556)
-                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.0555555555556)
-                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.0555555555556)
-                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.0555555555556);
+    h.parallel_for<class star3_1d<T>>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
+        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.16666666666666666)
+                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.16666666666666666)
+                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.16666666666666666)
+                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.16666666666666666)
+                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.08333333333333333)
+                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.08333333333333333)
+                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.08333333333333333)
+                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.08333333333333333)
+                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.05555555555555555)
+                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.05555555555555555)
+                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.05555555555555555)
+                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.05555555555555555);
     });
   });
 }
@@ -109,20 +120,20 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
     cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
     cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
     cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-    h.parallel_for<class star3_2d>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star3_2d<T>>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * static_cast<T>(0.166666666667)
-                   +in[xy-dx1] * static_cast<T>(-0.166666666667)
-                   +in[xy+dy1] * static_cast<T>(0.166666666667)
-                   +in[xy-dy1] * static_cast<T>(-0.166666666667)
-                   +in[xy+dx2] * static_cast<T>(0.0833333333333)
-                   +in[xy-dx2] * static_cast<T>(-0.0833333333333)
-                   +in[xy+dy2] * static_cast<T>(0.0833333333333)
-                   +in[xy-dy2] * static_cast<T>(-0.0833333333333)
-                   +in[xy+dx3] * static_cast<T>(0.0555555555556)
-                   +in[xy-dx3] * static_cast<T>(-0.0555555555556)
-                   +in[xy+dy3] * static_cast<T>(0.0555555555556)
-                   +in[xy-dy3] * static_cast<T>(-0.0555555555556);
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.16666666666666666)
+                   +in[xy-dx1] * static_cast<T>(-0.16666666666666666)
+                   +in[xy+dy1] * static_cast<T>(0.16666666666666666)
+                   +in[xy-dy1] * static_cast<T>(-0.16666666666666666)
+                   +in[xy+dx2] * static_cast<T>(0.08333333333333333)
+                   +in[xy-dx2] * static_cast<T>(-0.08333333333333333)
+                   +in[xy+dy2] * static_cast<T>(0.08333333333333333)
+                   +in[xy-dy2] * static_cast<T>(-0.08333333333333333)
+                   +in[xy+dx3] * static_cast<T>(0.05555555555555555)
+                   +in[xy-dx3] * static_cast<T>(-0.05555555555555555)
+                   +in[xy+dy3] * static_cast<T>(0.05555555555555555)
+                   +in[xy-dy3] * static_cast<T>(-0.05555555555555555);
     });
   });
 }
@@ -133,7 +144,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star4_1d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star4_1d<T>>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.125)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.125)
                               +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.125)
@@ -142,10 +153,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
                               +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.0625)
                               +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.0625)
                               +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.0625)
-                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.0416666666667)
-                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.0416666666667)
-                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.0416666666667)
-                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.0416666666667)
+                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.041666666666666664)
+                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.041666666666666664)
+                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.041666666666666664)
+                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.041666666666666664)
                               +in[it[0]*n+(it[1]+4)] * static_cast<T>(0.03125)
                               +in[it[0]*n+(it[1]-4)] * static_cast<T>(-0.03125)
                               +in[(it[0]+4)*n+it[1]] * static_cast<T>(0.03125)
@@ -168,7 +179,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
     cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
     cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
     cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
-    h.parallel_for<class star4_2d>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star4_2d<T>>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.125)
                    +in[xy-dx1] * static_cast<T>(-0.125)
@@ -178,10 +189,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
                    +in[xy-dx2] * static_cast<T>(-0.0625)
                    +in[xy+dy2] * static_cast<T>(0.0625)
                    +in[xy-dy2] * static_cast<T>(-0.0625)
-                   +in[xy+dx3] * static_cast<T>(0.0416666666667)
-                   +in[xy-dx3] * static_cast<T>(-0.0416666666667)
-                   +in[xy+dy3] * static_cast<T>(0.0416666666667)
-                   +in[xy-dy3] * static_cast<T>(-0.0416666666667)
+                   +in[xy+dx3] * static_cast<T>(0.041666666666666664)
+                   +in[xy-dx3] * static_cast<T>(-0.041666666666666664)
+                   +in[xy+dy3] * static_cast<T>(0.041666666666666664)
+                   +in[xy-dy3] * static_cast<T>(-0.041666666666666664)
                    +in[xy+dx4] * static_cast<T>(0.03125)
                    +in[xy-dx4] * static_cast<T>(-0.03125)
                    +in[xy+dy4] * static_cast<T>(0.03125)
@@ -196,7 +207,7 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   q.submit([&](cl::sycl::handler& h) {
     auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
     auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star5_1d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star5_1d<T>>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
         out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.1)
                               +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.1)
                               +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.1)
@@ -205,10 +216,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
                               +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.05)
                               +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.05)
                               +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.05)
-                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.0333333333333)
-                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.0333333333333)
-                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.0333333333333)
-                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.0333333333333)
+                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.03333333333333333)
+                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.03333333333333333)
+                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.03333333333333333)
+                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.03333333333333333)
                               +in[it[0]*n+(it[1]+4)] * static_cast<T>(0.025)
                               +in[it[0]*n+(it[1]-4)] * static_cast<T>(-0.025)
                               +in[(it[0]+4)*n+it[1]] * static_cast<T>(0.025)
@@ -237,7 +248,7 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
     cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
     cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
     cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
-    h.parallel_for<class star5_2d>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
+    h.parallel_for<class star5_2d<T>>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
         cl::sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.1)
                    +in[xy-dx1] * static_cast<T>(-0.1)
@@ -247,10 +258,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
                    +in[xy-dx2] * static_cast<T>(-0.05)
                    +in[xy+dy2] * static_cast<T>(0.05)
                    +in[xy-dy2] * static_cast<T>(-0.05)
-                   +in[xy+dx3] * static_cast<T>(0.0333333333333)
-                   +in[xy-dx3] * static_cast<T>(-0.0333333333333)
-                   +in[xy+dy3] * static_cast<T>(0.0333333333333)
-                   +in[xy-dy3] * static_cast<T>(-0.0333333333333)
+                   +in[xy+dx3] * static_cast<T>(0.03333333333333333)
+                   +in[xy-dx3] * static_cast<T>(-0.03333333333333333)
+                   +in[xy+dy3] * static_cast<T>(0.03333333333333333)
+                   +in[xy-dy3] * static_cast<T>(-0.03333333333333333)
                    +in[xy+dx4] * static_cast<T>(0.025)
                    +in[xy-dx4] * static_cast<T>(-0.025)
                    +in[xy+dy4] * static_cast<T>(0.025)

From 2d7e4e8b251bc2522af53902c34b6f79c53e42a4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 4 Mar 2019 12:42:26 -0800
Subject: [PATCH 129/245] fix float template for 2D case

---
 Cxx11/transpose-sycl.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 97bb8a09c..e7b1d94d2 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -75,8 +75,8 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
   try {
 
 #if USE_2D_INDEXING
-    cl::sycl::buffer<double,2> d_A( h_A.data(), cl::sycl::range<2>{order,order} );
-    cl::sycl::buffer<double,2> d_B( h_B.data(), cl::sycl::range<2>{order,order} );
+    cl::sycl::buffer<T,2> d_A( h_A.data(), cl::sycl::range<2>{order,order} );
+    cl::sycl::buffer<T,2> d_B( h_B.data(), cl::sycl::range<2>{order,order} );
 #else
     cl::sycl::buffer<T> d_A { h_A.data(), h_A.size() };
     cl::sycl::buffer<T> d_B { h_B.data(), h_B.size() };

From b2c63f6e132f6ff114a53fae4d816590ad879c97 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 4 Mar 2019 12:43:02 -0800
Subject: [PATCH 130/245] declare kernel name templates closer to usage

---
 Cxx11/generate-sycl-stencil.py |  7 ++-----
 Cxx11/stencil_sycl.hpp         | 37 +++++++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index f0feb9e3f..d88cae37b 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -6,6 +6,8 @@
 import os
 
 def codegen(src,pattern,stencil_size,radius,model,dim):
+    src.write('// declare the kernel name used in SYCL parallel_for\n')
+    src.write('template <typename T> class '+pattern+str(radius)+'_'+str(dim)+'d;\n\n')
     src.write('template <typename T>\n')
     src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ')
     if (dim==2):
@@ -91,11 +93,6 @@ def instance(src,model,pattern,r):
 def main():
     for model in ['sycl']:
       src = open('stencil_'+model+'.hpp','w')
-      for pattern in ['star']:
-        for r in range(1,6):
-          src.write('template <typename T> class '+pattern+str(r)+'_1d;\n')
-          src.write('template <typename T> class '+pattern+str(r)+'_2d;\n')
-      src.write('\n')
       #for pattern in ['star','grid']:
       for pattern in ['star']:
         for r in range(1,6):
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 799c86573..41412e5b4 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -1,13 +1,5 @@
+// declare the kernel name used in SYCL parallel_for
 template <typename T> class star1_1d;
-template <typename T> class star1_2d;
-template <typename T> class star2_1d;
-template <typename T> class star2_2d;
-template <typename T> class star3_1d;
-template <typename T> class star3_2d;
-template <typename T> class star4_1d;
-template <typename T> class star4_2d;
-template <typename T> class star5_1d;
-template <typename T> class star5_2d;
 
 template <typename T>
 void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
@@ -24,6 +16,9 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star1_2d;
+
 template <typename T>
 void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
@@ -42,6 +37,9 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star2_1d;
+
 template <typename T>
 void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
@@ -61,6 +59,9 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star2_2d;
+
 template <typename T>
 void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
@@ -85,6 +86,9 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star3_1d;
+
 template <typename T>
 void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
@@ -108,6 +112,9 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star3_2d;
+
 template <typename T>
 void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
@@ -138,6 +145,9 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star4_1d;
+
 template <typename T>
 void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
@@ -165,6 +175,9 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star4_2d;
+
 template <typename T>
 void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {
@@ -201,6 +214,9 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star5_1d;
+
 template <typename T>
 void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
 {
@@ -232,6 +248,9 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star5_2d;
+
 template <typename T>
 void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
 {

From 7eeec67c9f978608197a292d19361311a9c5bf9d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 12 Feb 2019 17:25:08 -0800
Subject: [PATCH 131/245] do not incorrectly declare non-read-only buffers as
 read-only

---
 Cxx11/nstream-opencl.cc       | 2 +-
 Cxx11/p2p-innerloop-opencl.cc | 2 +-
 Cxx11/stencil-opencl.cc       | 2 +-
 Cxx11/transpose-opencl.cc     | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc
index 9d81f1b8d..40b76d4cc 100644
--- a/Cxx11/nstream-opencl.cc
+++ b/Cxx11/nstream-opencl.cc
@@ -93,7 +93,7 @@ void run(cl::Context context, int iterations, size_t length)
   std::vector<T> h_c(length, T(2));
 
   // copy input from host to device
-  cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), true);
+  cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), false);
   cl::Buffer d_b = cl::Buffer(context, begin(h_b), end(h_b), true);
   cl::Buffer d_c = cl::Buffer(context, begin(h_c), end(h_c), true);
 
diff --git a/Cxx11/p2p-innerloop-opencl.cc b/Cxx11/p2p-innerloop-opencl.cc
index 2552fe787..43bbefa28 100644
--- a/Cxx11/p2p-innerloop-opencl.cc
+++ b/Cxx11/p2p-innerloop-opencl.cc
@@ -93,7 +93,7 @@ void run(cl::Context context, int iterations, int n)
   }
 
   // copy input from host to device
-  cl::Buffer d_grid = cl::Buffer(context, begin(h_grid), end(h_grid), true);
+  cl::Buffer d_grid = cl::Buffer(context, begin(h_grid), end(h_grid), false);
 
   auto pipeline_time = 0.0;
 
diff --git a/Cxx11/stencil-opencl.cc b/Cxx11/stencil-opencl.cc
index 8db6adfa4..0b82a5e79 100644
--- a/Cxx11/stencil-opencl.cc
+++ b/Cxx11/stencil-opencl.cc
@@ -124,7 +124,7 @@ void run(cl::Context context, int iterations, int n, int radius, bool star)
 
   // copy input from host to device
   cl::Buffer d_in = cl::Buffer(context, begin(h_in), end(h_in), true);
-  cl::Buffer d_out = cl::Buffer(context, begin(h_out), end(h_out), true);
+  cl::Buffer d_out = cl::Buffer(context, begin(h_out), end(h_out), false);
 
   for (auto iter = 0; iter<=iterations; iter++) {
 
diff --git a/Cxx11/transpose-opencl.cc b/Cxx11/transpose-opencl.cc
index 4e22114d5..dc1186ff2 100644
--- a/Cxx11/transpose-opencl.cc
+++ b/Cxx11/transpose-opencl.cc
@@ -85,8 +85,8 @@ void run(cl::Context context, int iterations, int order)
   std::iota(h_a.begin(), h_a.end(), (T)0);
 
   // copy input from host to device
-  cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), true);
-  cl::Buffer d_b = cl::Buffer(context, begin(h_b), end(h_b), true);
+  cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), false);
+  cl::Buffer d_b = cl::Buffer(context, begin(h_b), end(h_b), false);
 
   auto trans_time = 0.0;
 

From 725d5eebeef8a2c4fa9da17a5a5a57407c6d652d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 15 Mar 2019 15:14:15 -0700
Subject: [PATCH 132/245] remove Rust from parent makefile to unbreak case when
 cargo missing

---
 Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index e3be66f23..99d7ee2ae 100644
--- a/Makefile
+++ b/Makefile
@@ -72,14 +72,14 @@ help:
 	@echo "       \"make allfreaks\"    (re-)builds the above four targets"
 	@echo "       optionally, specify   \"matrix_rank=<n> number_of_functions=<m>\""
 	@echo "       optionally, specify   \"default_opt_flags=<list of optimization flags>\""
-	@echo "       \"make allshared\"    (re-)builds the shared-memory targets (C89, C1z, C++11, Fortran, RUST)"
+	@echo "       \"make allshared\"    (re-)builds the shared-memory targets (C89, C1z, C++11, Fortran)"
 	@echo "       \"make clean\"        removes all objects and executables"
 	@echo "       \"make veryclean\"    removes some generated source files as well"
 
 all: alldarwin allfreaks allshared
 alldarwin: allserial allopenmp allmpi1 allfgmpi allmpiopenmp allmpirma allshmem allmpishm allupc allfortran allfenix
 allfreaks: allcharm++ allampi allgrappa alllegion
-allshared: allserial allopenmp allfortran allcxx allc1z allrust
+allshared: allserial allopenmp allfortran allcxx allc1z
 allnew: allfortran allcxx allc1z
 
 allmpi1:
@@ -332,7 +332,6 @@ clean:
 	make -C FORTRAN clean
 	make -C Cxx11 clean
 	make -C C1z clean
-	make -C RUST clean
 	rm -f stats.json
 
 veryclean: clean

From 30a2c6f6b5c27f19f3f604ee9e85985fc83db334 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 15 Mar 2019 15:58:15 -0700
Subject: [PATCH 133/245] avoid overflow

---
 AMPI/Stencil/stencil.c      | 2 +-
 CHARM++/Stencil/stencil.C   | 2 +-
 FG_MPI/Stencil/stencil.c    | 2 +-
 MPI1/Stencil/stencil.c      | 2 +-
 MPIOPENMP/Stencil/stencil.c | 2 +-
 MPIRMA/Stencil/stencil.c    | 2 +-
 MPISHM/Stencil/stencil.c    | 2 +-
 SHMEM/Stencil/stencil.c     | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/AMPI/Stencil/stencil.c b/AMPI/Stencil/stencil.c
index 228853b24..6d79c5abe 100644
--- a/AMPI/Stencil/stencil.c
+++ b/AMPI/Stencil/stencil.c
@@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width))
diff --git a/CHARM++/Stencil/stencil.C b/CHARM++/Stencil/stencil.C
index 273198b3f..2dee692af 100644
--- a/CHARM++/Stencil/stencil.C
+++ b/CHARM++/Stencil/stencil.C
@@ -4,7 +4,7 @@
 #define EPSILON       1.e-8
 #define COEFX         1.0
 #define COEFY         1.0
-#define INDEXIN(i,j)  (i+RADIUS+(width+2*RADIUS)*(j+RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(width+2*RADIUS)*(long)(j+RADIUS))
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+width*(j))
 #define OUT(i,j)      out[INDEXOUT(i-istart,j-jstart)]
diff --git a/FG_MPI/Stencil/stencil.c b/FG_MPI/Stencil/stencil.c
index bee84ae1e..f346264a3 100644
--- a/FG_MPI/Stencil/stencil.c
+++ b/FG_MPI/Stencil/stencil.c
@@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width))
diff --git a/MPI1/Stencil/stencil.c b/MPI1/Stencil/stencil.c
index 2f33fe5e9..2417b1e00 100644
--- a/MPI1/Stencil/stencil.c
+++ b/MPI1/Stencil/stencil.c
@@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width))
diff --git a/MPIOPENMP/Stencil/stencil.c b/MPIOPENMP/Stencil/stencil.c
index 54b614d52..a80200030 100644
--- a/MPIOPENMP/Stencil/stencil.c
+++ b/MPIOPENMP/Stencil/stencil.c
@@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width))
diff --git a/MPIRMA/Stencil/stencil.c b/MPIRMA/Stencil/stencil.c
index a2c320af2..49d63e85e 100644
--- a/MPIRMA/Stencil/stencil.c
+++ b/MPIRMA/Stencil/stencil.c
@@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width))
diff --git a/MPISHM/Stencil/stencil.c b/MPISHM/Stencil/stencil.c
index 5f2b92242..a7593680c 100644
--- a/MPISHM/Stencil/stencil.c
+++ b/MPISHM/Stencil/stencil.c
@@ -148,7 +148,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width))
diff --git a/SHMEM/Stencil/stencil.c b/SHMEM/Stencil/stencil.c
index 400e4b1f3..4e2b35376 100644
--- a/SHMEM/Stencil/stencil.c
+++ b/SHMEM/Stencil/stencil.c
@@ -77,7 +77,7 @@ HISTORY: - Written by Tom St. John, July 2015.
 #endif
 
 /* define shorthand for indexing multi-dimensional arrays with offsets           */
-#define INDEXIN(i,j)  (i+RADIUS+(j+RADIUS)*(width[0]+2*RADIUS))
+#define INDEXIN(i,j)  (i+RADIUS+(long)(j+RADIUS)*(long)(width[0]+2*RADIUS))
 /* need to add offset of RADIUS to j to account for ghost points                 */
 #define IN(i,j)       in[INDEXIN(i-istart,j-jstart)]
 #define INDEXOUT(i,j) (i+(j)*(width[0]))

From d4ca82f39d0fcd177c9a8dd14b88e23aace8a1b7 Mon Sep 17 00:00:00 2001
From: Christian Trott <crtrott@sandia.gov>
Date: Thu, 4 Apr 2019 10:49:23 -0700
Subject: [PATCH 134/245] Cxx11 nstream-kokkos: add missing fences

There are fences missing hence you wont' measure what you think on
asynchronous backends such as CUDA or HPX.  This also fixes using the actual name of the exec space instead of typeid.
Example for CUDA on V100:
Original:
Parallel Research Kernels version 2.16
C++11/Kokkos STREAM triad: A = B + scalar * C
Number of iterations = 1
Vector length        = 100000000
Offset               = 0
Kokkos execution space: N6Kokkos4CudaE
Solution validates
Rate (MB/s): 422188 Avg time (s): 0.00757957

With fences (and name fix):
Parallel Research Kernels version 2.16
C++11/Kokkos STREAM triad: A = B + scalar * C
Number of iterations = 1
Vector length        = 100000000
Offset               = 0
Kokkos execution space: Cuda
Solution validates
Rate (MB/s): 842600 Avg time (s): 0.00379777
---
 Cxx11/nstream-kokkos.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc
index d03a47207..9e0af56bd 100644
--- a/Cxx11/nstream-kokkos.cc
+++ b/Cxx11/nstream-kokkos.cc
@@ -118,7 +118,7 @@ int main(int argc, char * argv[])
     std::cout << "Number of iterations = " << iterations << std::endl;
     std::cout << "Vector length        = " << length << std::endl;
     std::cout << "Offset               = " << offset << std::endl;
-    std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+    std::cout << "Kokkos execution space: " << Kokkos::DefaultExecutionSpace::name() << std::endl;
 
     //////////////////////////////////////////////////////////////////////
     // Allocate space and perform the computation
@@ -138,15 +138,19 @@ int main(int argc, char * argv[])
           B[i] = 2.0;
           C[i] = 2.0;
       });
-
+      Kokkos::fence();
       for (int iter = 0; iter<=iterations; ++iter) {
 
-        if (iter==1) nstream_time = prk::wtime();
+        if (iter==1) {
+          Kokkos::fence();
+          nstream_time = prk::wtime();
+        }
 
         Kokkos::parallel_for(length, KOKKOS_LAMBDA(size_t const i) {
             A[i] += B[i] + scalar * C[i];
         });
       }
+      Kokkos::fence();
       nstream_time = prk::wtime() - nstream_time;
     }
 

From ac5a44bdaa52a066097f47bad22f4648f06aed64 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 7 Apr 2019 20:32:08 -0700
Subject: [PATCH 135/245] fix how BLAS linked in Fortran

---
 FORTRAN/Makefile     | 9 +++++----
 common/make.defs.gcc | 4 +++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index 4d479881c..551c8fc36 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -14,9 +14,7 @@ FCFLAGS  = $(DEFAULT_OPT_FLAGS)
 FCFLAGS += -DRADIUS=$(RADIUS) $(STARFLAG)
 
 ifeq ($(findstring ifort,$(FC)),ifort)
-  BLASFLAGS = $(CBLASFLAG) -heap-arrays
-else
-  BLASFLAGS = $(CBLASFLAG)
+  BLASFLAGS += -heap-arrays
 endif
 
 .PHONY: all clean serial pretty openmp coarray target ornlacc
@@ -68,9 +66,12 @@ stencil: stencil.f90 stencil_serial.f90
 	#$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o
 	$(FC) $(FCFLAGS) $< -o $@
 
-%-pretty: %-pretty.f90
+dgemm-pretty: dgemm-pretty.f90
 	$(FC) $(FCFLAGS) $(BLASFLAGS) $< -o $@
 
+%-pretty: %-pretty.f90
+	$(FC) $(FCFLAGS) $< -o $@
+
 %-openmp: %.f90
 	$(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@
 
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 5f3f62f03..89a0a1b0e 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -92,15 +92,17 @@ THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
 SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
-SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG}
+SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
 #SYCLCXX=${CXX} ${OPENMPFLAG}
 #SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+SYCLFLAG+=${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
 #
+BLASFLAG=-DACCELERATE -framework Accelerate
 CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
 #
 # CUDA flags

From 432ac49bd2ba28c01e90909613dc45d633cf4a2f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 7 Apr 2019 20:34:50 -0700
Subject: [PATCH 136/245] range-based TBB parallel_for

---
 Cxx11/nstream-vector-tbb.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc
index 0fbc777c2..0859e0654 100644
--- a/Cxx11/nstream-vector-tbb.cc
+++ b/Cxx11/nstream-vector-tbb.cc
@@ -126,6 +126,7 @@ int main(int argc, char * argv[])
   tbb::blocked_range<size_t> range(0, length);
 
   {
+#if 0
     tbb::parallel_for( range, [&](decltype(range)& r) {
                        for (auto i=r.begin(); i!=r.end(); ++i ) {
                            A[i] = 0.0;
@@ -133,16 +134,29 @@ int main(int argc, char * argv[])
                            C[i] = 2.0;
                        }
                      }, tbb_partitioner);
+#else
+    tbb::parallel_for( std::begin(range), std::end(range), [&](size_t i) {
+                           A[i] = 0.0;
+                           B[i] = 2.0;
+                           C[i] = 2.0;
+                       }, tbb_partitioner);
+#endif
 
     for (auto iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) nstream_time = prk::wtime();
 
+#if 0
       tbb::parallel_for( range, [&](decltype(range)& r) {
                          for (auto i=r.begin(); i!=r.end(); ++i ) {
                              A[i] += B[i] + scalar * C[i];
                          }
                        }, tbb_partitioner);
+#else
+      tbb::parallel_for( std::begin(range), std::end(range), [&](size_t i) {
+                             A[i] += B[i] + scalar * C[i];
+                         }, tbb_partitioner);
+#endif
     }
     nstream_time = prk::wtime() - nstream_time;
   }

From 06c3bffb7cf1fcd783f1db39ee5c0cfc5de31a9f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 7 Apr 2019 20:35:35 -0700
Subject: [PATCH 137/245] show but do not enable non-range-based for in RAJA

---
 Cxx11/nstream-raja.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc
index c98dae978..ef7b6c08e 100644
--- a/Cxx11/nstream-raja.cc
+++ b/Cxx11/nstream-raja.cc
@@ -133,6 +133,7 @@ int main(int argc, char * argv[])
   double scalar(3);
 
   {
+    //RAJA::forall<thread_exec>(0, length, [=](RAJA::Index_type i) {
     RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
         A(i) = 0.0;
         B(i) = 2.0;
@@ -143,6 +144,7 @@ int main(int argc, char * argv[])
 
       if (iter==1) nstream_time = prk::wtime();
 
+      //RAJA::forall<thread_exec>(0, length, [=](RAJA::Index_type i) {
       RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
           A(i) += B(i) + scalar * C(i);
       });

From 7f0e0ff264c994ea070230c188bd36f4c1198e5e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 17:45:09 -0500
Subject: [PATCH 138/245] fix issues with Thrust when not using NVCC

---
 Cxx11/nstream-host-thrust.cc |  2 --
 Cxx11/prk_thrust.h           | 14 +++++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc
index c06c89108..ac82f33d3 100644
--- a/Cxx11/nstream-host-thrust.cc
+++ b/Cxx11/nstream-host-thrust.cc
@@ -115,8 +115,6 @@ int main(int argc, char * argv[])
   thrust::host_vector<double> B(length);
   thrust::host_vector<double> C(length);
 
-  auto range = prk::range(static_cast<size_t>(0), length);
-
   double scalar(3);
   {
     thrust::fill(thrust::host, A.begin(), A.end(), 0.0);
diff --git a/Cxx11/prk_thrust.h b/Cxx11/prk_thrust.h
index 4ffd50c34..1d733bf67 100644
--- a/Cxx11/prk_thrust.h
+++ b/Cxx11/prk_thrust.h
@@ -35,16 +35,24 @@
 #ifdef USE_THRUST
 # ifdef __NVCC__
 #  include <thrust/device_vector.h>
+# elif defined(_OPENMP)
+#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_OMP
+#  include <thrust/system/omp/execution_policy.h>
+//#  include <thrust/system/omp/vector.h>
+#else
+#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CPP
+#  include <thrust/system/omp/execution_policy.h>
+//#  include <thrust/system/cpp/vector.h>
 # endif
 # include <thrust/host_vector.h>
 # include <thrust/fill.h>
-# include <thrust/sequence.h>
-# include <thrust/for_each.h>
 # include <thrust/transform.h>
 # include <thrust/transform_reduce.h>
+# include <thrust/functional.h>
+# include <thrust/sequence.h>
+# include <thrust/for_each.h>
 # include <thrust/iterator/counting_iterator.h>
 # include <thrust/execution_policy.h>
-# include <thrust/functional.h>
 #endif
 
 #endif /* PRK_THRUST_H */

From 0c920eeb627402d90ee243bca2fb8222a090100e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 17:52:37 -0500
Subject: [PATCH 139/245] update examples for Thrust changes

---
 common/make.defs.cuda  | 66 ++++++++++++++++++++++++++++++++++++++++--
 common/make.defs.gcc   | 15 ++++++----
 common/make.defs.intel |  4 +--
 common/make.defs.llvm  |  5 ++--
 4 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 48b85710d..0f5fafb75 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -6,24 +6,37 @@
 #
 VERSION=-7
 # C99 is required in some implementations.
-CC=gcc${VERSION} -std=gnu99
+CC=gcc${VERSION} -std=gnu11
+#EXTRA_CLIBS=-lrt
 # All of the Fortran code is written for the 2008 standard and requires preprocessing.
 FC=gfortran${VERSION} -std=f2008 -cpp
 # C++11 may not be required but does no harm here.
-CXX=g++${VERSION} -std=gnu++11
+CXX=g++${VERSION} -std=gnu++17
 #
 # Compiler flags
 #
 # -mtune=native is appropriate for most cases.
 # -march=native is appropriate if you want portable binaries.
-DEFAULT_OPT_FLAGS=-g -O3 -mtune=native
+#DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math
+DEFAULT_OPT_FLAGS=-O0
+DEFAULT_OPT_FLAGS+=-g3
+#DEFAULT_OPT_FLAGS+=-fsanitize=undefined
+#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak
+#DEFAULT_OPT_FLAGS+=-fsanitize=address
+#DEFAULT_OPT_FLAGS+=-fsanitize=thread
 # If you are compiling for KNL on a Xeon login node, use the following:
 # DEFAULT_OPT_FLAGS=-g -O3 -march=knl
 # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
 #
+#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
+DEFAULT_OPT_FLAGS+=-Wall #-Werror
+DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations
+#DEFAULT_OPT_FLAGS+=-mavx -mfma
+#
 # OpenMP flags
 #
 OPENMPFLAG=-fopenmp
+OPENMPSIMDFLAG=-fopenmp-simd
 OFFLOADFLAG=-foffload="-O3 -v"
 ORNLACCFLAG=-fopenacc
 #
@@ -35,6 +48,53 @@ ORNLACCFLAG=-fopenacc
 #OPENCLDIR=/etc/alternatives/opencl-intel-tools
 #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 OPENCLFLAG=-I/usr/include -L/usr/lib/x86_64-linux-gnu -lOpenCL
+OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations
+METALFLAG=-framework MetalPerformanceShaders
+#
+# OCCA
+#
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
+# Cilk
+#
+#CILKFLAG=-fcilkplus
+#
+# TBB
+#
+TBBDIR=/usr/local/Cellar/tbb/2019_U5_1
+TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#
+# Parallel STL, Boost, etc.
+#
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+KOKKOSDIR=/opt/kokkos/gcc
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
+RAJADIR=/opt/raja/gcc
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+THRUSTDIR=/opt/nvidia/thrust
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
+#
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
+SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+SYCLFLAG+=${RANGEFLAG}
+#
+# CBLAS for C++ DGEMM
+#
+BLASFLAG=-DACCELERATE -framework Accelerate
+CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
 #
 # CUDA flags
 #
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 89a0a1b0e..50b7a572a 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -17,10 +17,13 @@ CXX=g++${VERSION} -std=gnu++17 -pthread
 #
 # -mtune=native is appropriate for most cases.
 # -march=native is appropriate if you want portable binaries.
-DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
-#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=undefined,leak
-#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=address
-#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=thread
+DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math
+#DEFAULT_OPT_FLAGS=-O0
+DEFAULT_OPT_FLAGS+=-g3
+#DEFAULT_OPT_FLAGS+=-fsanitize=undefined
+#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak
+#DEFAULT_OPT_FLAGS+=-fsanitize=address
+#DEFAULT_OPT_FLAGS+=-fsanitize=thread
 # If you are compiling for KNL on a Xeon login node, use the following:
 # DEFAULT_OPT_FLAGS=-g -O3 -march=knl
 # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
@@ -84,7 +87,7 @@ KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust
-THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
 # SYCL flags
 #
@@ -97,7 +100,7 @@ SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
 #SYCLCXX=${CXX} ${OPENMPFLAG}
-#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 SYCLFLAG+=${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
diff --git a/common/make.defs.intel b/common/make.defs.intel
index 17a4c2833..664d79e0c 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -76,8 +76,8 @@ KOKKOSDIR=/opt/kokkos/intel
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/intel
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
-#THRUSTDIR=/opt/nvidia/thrust
-#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
+THRUSTDIR=/opt/nvidia/thrust
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
 #
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 4929aa0bb..fda35f476 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -105,11 +105,12 @@ KOKKOSDIR=/opt/kokkos/clang
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
 RAJADIR=/opt/raja/clang
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
-#THRUSTDIR=/opt/nvidia/thrust
-#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
+THRUSTDIR=/opt/nvidia/thrust
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
 #
+BLASFLAG=-DACCELERATE -framework Accelerate
 CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
 #
 # CUDA flags

From 27fff516c43653facef1ff3a12643ee19220c3d6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 17:53:10 -0500
Subject: [PATCH 140/245] switch Thrust to use PRK range wrapper

---
 Cxx11/transpose-device-thrust.cu | 2 +-
 Cxx11/transpose-host-thrust.cc   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/transpose-device-thrust.cu b/Cxx11/transpose-device-thrust.cu
index b4c9a1874..907f45e94 100644
--- a/Cxx11/transpose-device-thrust.cu
+++ b/Cxx11/transpose-device-thrust.cu
@@ -128,7 +128,7 @@ int main(int argc, char * argv[])
   thrust::sequence(thrust::device, A.begin(), A.end() );
   thrust::fill(thrust::device, B.begin(), B.end(), 0.0);
 
-  auto range = boost::irange(0,order);
+  auto range = prk::range(0,order);
 
   auto trans_time = 0.0;
 
diff --git a/Cxx11/transpose-host-thrust.cc b/Cxx11/transpose-host-thrust.cc
index 11482700a..07065b7e8 100644
--- a/Cxx11/transpose-host-thrust.cc
+++ b/Cxx11/transpose-host-thrust.cc
@@ -100,7 +100,7 @@ int main(int argc, char * argv[])
   thrust::sequence(thrust::host, A.begin(), A.end() );
   thrust::fill(thrust::host, B.begin(), B.end(), 0.0);
 
-  auto range = boost::irange(0,order);
+  auto range = prk::range(0,order);
 
   auto trans_time = 0.0;
 

From 4447d97cff7f75a543e78ea517209f6ea4b84eef Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 19 Apr 2019 16:50:46 -0700
Subject: [PATCH 141/245] not yet working prk::vector

---
 Cxx11/Makefile   |   2 +-
 Cxx11/prk_util.h | 107 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 65875fba4..f71b2a568 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -81,7 +81,7 @@ transpose: transpose-valarray transpose-vector transpose-vector-async transpose-
 	   transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \
 	   transpose-vector-rangefor transpose-vector-tbb transpose-vector-thread transpose-kokkos transpose-opencl
 
-nstream: nstream-valarray nstream-vector nstream-openmp nstream-openmp-target \
+#nstream: nstream-valarray nstream-vector nstream-openmp nstream-openmp-target \
 	 nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \
 	 nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl
 
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index b25dccdf6..0062ba66d 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -84,6 +84,85 @@
 
 namespace prk {
 
+    int get_alignment(void)
+    {
+        /* a := alignment */
+#ifdef PRK_ALIGNMENT
+        int a = PRK_ALIGNMENT;
+#else
+        char* temp = getenv("PRK_ALIGNMENT");
+        int a = (temp!=NULL) ? atoi(temp) : 64;
+        if (a < 8) a = 8;
+        assert( (a & (~a+1)) == a ); /* is power of 2? */
+#endif
+        return a;
+    }
+
+#if defined(__INTEL_COMPILER)
+
+    template <typename T>
+    T * malloc(size_t bytes)
+    {
+        const int alignment = prk::get_alignment();
+        return (T*)_mm_malloc( bytes, alignment);
+    }
+
+    template <typename T>
+    void free(T * p)
+    {
+        _mm_free(p);
+    }
+
+#else // !__INTEL_COMPILER
+
+    template <typename T>
+    void * malloc(size_t bytes)
+    {
+        const int alignment = prk_get_alignment();
+
+        // We cannot use C11 aligned_alloc on Mac.
+        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69680 */
+        // GCC claims to be C11 without knowing if glibc is compliant...
+#if !defined(__GNUC__) && \
+    !defined(__APPLE__) && \
+     defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+
+        // From ISO C11:
+        //
+        // "The aligned_alloc function allocates space for an object
+        //  whose alignment is specified by alignment, whose size is
+        //  specified by size, and whose value is indeterminate.
+        //  The value of alignment shall be a valid alignment supported
+        //  by the implementation and the value of size shall be an
+        //  integral multiple of alignment."
+        //
+        //  Thus, if we do not round up the bytes to be a multiple
+        //  of the alignment, we violate ISO C.
+
+        size_t padded = bytes;
+        size_t excess = bytes % alignment;
+        if (excess>0) padded += (alignment - excess);
+        return aligned_alloc(alignment,padded);
+
+#else
+
+        T * ptr = NULL;
+        int ret = posix_memalign(&ptr,alignment,bytes);
+        if (ret!=0) ptr = NULL;
+        return ptr;
+
+#endif
+
+    }
+
+    template <typename T>
+    void free(void * p)
+    {
+        free(p);
+    }
+
+#endif // __INTEL_COMPILER
+
     template<class I, class T>
     const T reduce(I first, I last, T init) {
 #if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__)
@@ -100,6 +179,34 @@ namespace prk {
 #endif
     }
 
+    template <class T>
+    class vector {
+
+        private:
+            T * data_;
+            size_t size_;
+
+        public:
+
+        vector(size_t n) {
+            this->data_ = prk::malloc<T>(n);
+        }
+
+        vector(size_t n, T v) {
+            this->data_ = prk::malloc<T>(n);
+            for (size_t i=0; i<n; ++i) this->data_[i] = v;
+        }
+
+        ~vector() {
+            prk::free(this->data_);
+        }
+
+        T & operator[] (size_t n) {
+            return this->data_[n];
+        }
+
+    };
+
     static inline double wtime(void)
     {
 #if defined(USE_OPENMP) && defined(_OPENMP)

From 7bde7d95257de6950e0254e78009155a2b6cf6b5 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 14:44:52 -0700
Subject: [PATCH 142/245] fix prk::vector

---
 Cxx11/prk_util.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 0062ba66d..889e7e87d 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -101,9 +101,10 @@ namespace prk {
 #if defined(__INTEL_COMPILER)
 
     template <typename T>
-    T * malloc(size_t bytes)
+    T * malloc(size_t n)
     {
         const int alignment = prk::get_alignment();
+        const size_t bytes = n * sizeof(T);
         return (T*)_mm_malloc( bytes, alignment);
     }
 
@@ -189,15 +190,18 @@ namespace prk {
         public:
 
         vector(size_t n) {
+            //this->data_ = new T[n];
             this->data_ = prk::malloc<T>(n);
         }
 
         vector(size_t n, T v) {
+            //this->data_ = new T[n];
             this->data_ = prk::malloc<T>(n);
             for (size_t i=0; i<n; ++i) this->data_[i] = v;
         }
 
         ~vector() {
+            //delete[] this->data_;
             prk::free(this->data_);
         }
 

From 8f344f76b878f59abe3d571606ba22144694ac40 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 14:45:00 -0700
Subject: [PATCH 143/245] eliminate rule conflict

---
 Cxx11/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index f71b2a568..3953c4d80 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -68,16 +68,16 @@ endif
 
 all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA)
 
-p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
+#p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
      p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb
 
-stencil: stencil-valarray stencil-vector stencil-vector-async stencil-openmp stencil-openmp-target \
+#stencil: stencil-valarray stencil-vector stencil-vector-async stencil-openmp stencil-openmp-target \
 	 stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \
 	 stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl \
 	 stencil-cuda
 
-transpose: transpose-valarray transpose-vector transpose-vector-async transpose-openmp transpose-openmp-target \
+#transpose: transpose-valarray transpose-vector transpose-vector-async transpose-openmp transpose-openmp-target \
 	   transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \
 	   transpose-vector-rangefor transpose-vector-tbb transpose-vector-thread transpose-kokkos transpose-opencl
 
@@ -85,7 +85,7 @@ transpose: transpose-valarray transpose-vector transpose-vector-async transpose-
 	 nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \
 	 nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl
 
-dgemm: dgemm-vector dgemm-cblas dgemm-cublas
+#dgemm: dgemm-vector dgemm-cblas dgemm-cublas
 
 vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \
 	transpose-vector-async transpose-vector-thread

From bc3e6b159499bb4c6bca5baead38aaefe9b83df4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:34:00 -0700
Subject: [PATCH 144/245] switch from std::vector to prk::vector

---
 Cxx11/nstream-vector-tbb.cc        |  6 +++---
 Cxx11/p2p-hyperplane-vector-tbb.cc |  2 +-
 Cxx11/p2p-innerloop-vector-tbb.cc  |  2 +-
 Cxx11/p2p-vector-tbb.cc            |  4 ++--
 Cxx11/stencil-vector-tbb.cc        |  6 +++---
 Cxx11/stencil_tbb.hpp              | 20 ++++++++++----------
 Cxx11/transpose-vector-tbb.cc      |  4 ++--
 7 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc
index 0859e0654..2c507af18 100644
--- a/Cxx11/nstream-vector-tbb.cc
+++ b/Cxx11/nstream-vector-tbb.cc
@@ -117,9 +117,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A(length);
-  std::vector<double> B(length);
-  std::vector<double> C(length);
+  prk::vector<double> A(length);
+  prk::vector<double> B(length);
+  prk::vector<double> C(length);
 
   double scalar(3);
 
diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc
index 9c523a369..426006182 100644
--- a/Cxx11/p2p-hyperplane-vector-tbb.cc
+++ b/Cxx11/p2p-hyperplane-vector-tbb.cc
@@ -123,7 +123,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  std::vector<double> grid(n*n,0.0);
+  prk::vector<double> grid(n*n,0.0);
 
   // set boundary values (bottom and left side of grid)
   for (auto j=0; j<n; j++) {
diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc
index 1f58ab081..0eca934e7 100644
--- a/Cxx11/p2p-innerloop-vector-tbb.cc
+++ b/Cxx11/p2p-innerloop-vector-tbb.cc
@@ -112,7 +112,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  std::vector<double> grid(n*n,0.0);
+  prk::vector<double> grid(n*n,0.0);
 
   // set boundary values (bottom and left side of grid)
   for (auto j=0; j<n; j++) {
diff --git a/Cxx11/p2p-vector-tbb.cc b/Cxx11/p2p-vector-tbb.cc
index 74cf57819..bded7e6db 100644
--- a/Cxx11/p2p-vector-tbb.cc
+++ b/Cxx11/p2p-vector-tbb.cc
@@ -62,7 +62,7 @@
 #include "prk_util.h"
 #include "prk_tbb.h"
 
-void SequentialSweep(int m, int n, std::vector<double> & grid)
+void SequentialSweep(int m, int n, prk::vector<double> & grid)
 {
   for (auto i=1; i<m; i++) {
     for (auto j=1; j<n; j++) {
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
 
   auto pipeline_time = 0.0; // silence compiler warning
 
-  std::vector<double> grid(m*n,0.0);
+  prk::vector<double> grid(m*n,0.0);
 
   // set boundary values (bottom and left side of grid)
   for (auto j=0; j<n; j++) {
diff --git a/Cxx11/stencil-vector-tbb.cc b/Cxx11/stencil-vector-tbb.cc
index 81a252019..0875bbf52 100644
--- a/Cxx11/stencil-vector-tbb.cc
+++ b/Cxx11/stencil-vector-tbb.cc
@@ -64,7 +64,7 @@
 #include "prk_tbb.h"
 #include "stencil_tbb.hpp"
 
-void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out)
+void nothing(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out)
 {
     std::cout << "You are trying to use a stencil that does not exist." << std::endl;
     std::cout << "Please generate the new stencil using the code generator." << std::endl;
@@ -170,8 +170,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in(n*n);
-  std::vector<double> out(n*n);
+  prk::vector<double> in(n*n);
+  prk::vector<double> out(n*n);
 
   tbb::blocked_range2d<int> range(0, n, tile_size, 0, n, tile_size);
   tbb::parallel_for( range, [&](decltype(range)& r) {
diff --git a/Cxx11/stencil_tbb.hpp b/Cxx11/stencil_tbb.hpp
index 7b68173a9..edc168be2 100644
--- a/Cxx11/stencil_tbb.hpp
+++ b/Cxx11/stencil_tbb.hpp
@@ -1,4 +1,4 @@
-void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(1, n-1, t, 1, n-1, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -13,7 +13,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(2, n-2, t, 2, n-2, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -32,7 +32,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(3, n-3, t, 3, n-3, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -55,7 +55,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(4, n-4, t, 4, n-4, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -82,7 +82,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(5, n-5, t, 5, n-5, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -113,7 +113,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(1, n-1, t, 1, n-1, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -131,7 +131,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(2, n-2, t, 2, n-2, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -163,7 +163,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(3, n-3, t, 3, n-3, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -217,7 +217,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(4, n-4, t, 4, n-4, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
@@ -301,7 +301,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
   }, tbb_partitioner );
 }
 
-void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
   tbb::blocked_range2d<int> range(5, n-5, t, 5, n-5, t);
   tbb::parallel_for( range, [&](decltype(range)& r ) {
     for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {
diff --git a/Cxx11/transpose-vector-tbb.cc b/Cxx11/transpose-vector-tbb.cc
index d154677fd..8e31954b6 100644
--- a/Cxx11/transpose-vector-tbb.cc
+++ b/Cxx11/transpose-vector-tbb.cc
@@ -112,8 +112,8 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order);
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order);
 
   tbb::blocked_range2d<int> range(0, order, tile_size, 0, order, tile_size);
   tbb::parallel_for( range, [&](decltype(range)& r) {

From 5cf87f75247f3962123fb5acb7571b2e786a12a2 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:48:53 -0700
Subject: [PATCH 145/245] use prk::vector instead of std::vector

---
 Cxx11/transpose-cblas.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cxx11/transpose-cblas.cc b/Cxx11/transpose-cblas.cc
index 9f7f17b07..6add9f18d 100644
--- a/Cxx11/transpose-cblas.cc
+++ b/Cxx11/transpose-cblas.cc
@@ -105,9 +105,9 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order,0.0);
-  std::vector<double> T(order*order);
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order,0.0);
+  prk::vector<double> T(order*order);
   double one[1] = {1.0};
 
   // fill A with the sequence 0 to order^2-1 as doubles

From fd46cfdc07c50b7b4095bc561517f7fd6a3981ce Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:49:22 -0700
Subject: [PATCH 146/245] use prk::vector instead of std::vector

---
 Cxx11/nstream-vector-rangefor.cc   |  6 +++---
 Cxx11/stencil-vector-rangefor.cc   |  6 +++---
 Cxx11/stencil_rangefor.hpp         | 20 ++++++++++----------
 Cxx11/transpose-vector-rangefor.cc |  4 ++--
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/Cxx11/nstream-vector-rangefor.cc b/Cxx11/nstream-vector-rangefor.cc
index 2bdadea3d..56948aaa0 100644
--- a/Cxx11/nstream-vector-rangefor.cc
+++ b/Cxx11/nstream-vector-rangefor.cc
@@ -112,9 +112,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A(length,0.0);
-  std::vector<double> B(length,2.0);
-  std::vector<double> C(length,2.0);
+  prk::vector<double> A(length,0.0);
+  prk::vector<double> B(length,2.0);
+  prk::vector<double> C(length,2.0);
 
   auto range = prk::range(0,length);
 
diff --git a/Cxx11/stencil-vector-rangefor.cc b/Cxx11/stencil-vector-rangefor.cc
index 040bde745..4ec0eb06e 100644
--- a/Cxx11/stencil-vector-rangefor.cc
+++ b/Cxx11/stencil-vector-rangefor.cc
@@ -63,7 +63,7 @@
 #include "prk_util.h"
 #include "stencil_seq.hpp"
 
-void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out)
+void nothing(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out)
 {
     std::cout << "You are trying to use a stencil that does not exist.\n";
     std::cout << "Please generate the new stencil using the code generator\n";
@@ -164,8 +164,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in(n*n);
-  std::vector<double> out(n*n);
+  prk::vector<double> in(n*n);
+  prk::vector<double> out(n*n);
 
   // initialize the input and output arrays
   auto range = prk::range(0,n);
diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp
index f1ecb729e..85b23d342 100644
--- a/Cxx11/stencil_rangefor.hpp
+++ b/Cxx11/stencil_rangefor.hpp
@@ -1,4 +1,4 @@
-void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(1,n-1);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -13,7 +13,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(2,n-2);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -32,7 +32,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(3,n-3);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -55,7 +55,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(4,n-4);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -82,7 +82,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(5,n-5);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -113,7 +113,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(1,n-1);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -131,7 +131,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(2,n-2);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -163,7 +163,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(3,n-3);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -217,7 +217,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(4,n-4);
     for (auto i : inside) {
       PRAGMA_SIMD
@@ -301,7 +301,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     auto inside = prk::range(5,n-5);
     for (auto i : inside) {
       PRAGMA_SIMD
diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc
index 3d2e4f9f1..2d4ba5449 100644
--- a/Cxx11/transpose-vector-rangefor.cc
+++ b/Cxx11/transpose-vector-rangefor.cc
@@ -103,8 +103,8 @@ int main(int argc, char * argv[])
 
   auto trans_time = 0.0;
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order,0.0);
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order,0.0);
 
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);

From 6bf07538edc128ec9be36cfe497df23c0fe3d784 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:50:14 -0700
Subject: [PATCH 147/245] use prk::vector instead of std::vector

---
 Cxx11/transpose-vector-thread.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc
index 44071ca95..1ac5c693a 100644
--- a/Cxx11/transpose-vector-thread.cc
+++ b/Cxx11/transpose-vector-thread.cc
@@ -130,8 +130,8 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order,0.0);
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order,0.0);
 
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);

From 9dddc2ce65280b9d5fc677290b94ef449ec7921b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:54:53 -0700
Subject: [PATCH 148/245] use prk::vector instead of std::vector

---
 Cxx11/transpose-vector-async.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc
index c68b8c463..c23011d75 100644
--- a/Cxx11/transpose-vector-async.cc
+++ b/Cxx11/transpose-vector-async.cc
@@ -126,8 +126,8 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order,0.0);
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order,0.0);
 
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(A.begin(), A.end(), 0.0);

From 7ccf646ecae92bc9fefa0f6c637b3f8c61ac8620 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:55:38 -0700
Subject: [PATCH 149/245] use prk::vector instead of std::vector

---
 Cxx11/nstream-vector-taskloop.cc   |  6 +++---
 Cxx11/stencil-vector-taskloop.cc   |  6 +++---
 Cxx11/stencil_taskloop.hpp         | 20 ++++++++++----------
 Cxx11/transpose-vector-taskloop.cc |  4 ++--
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc
index 95bd5c925..3f4c8f1d6 100644
--- a/Cxx11/nstream-vector-taskloop.cc
+++ b/Cxx11/nstream-vector-taskloop.cc
@@ -120,9 +120,9 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<double> A(length);
-  std::vector<double> B(length);
-  std::vector<double> C(length);
+  prk::vector<double> A(length);
+  prk::vector<double> B(length);
+  prk::vector<double> C(length);
 
   double scalar = 3.0;
 
diff --git a/Cxx11/stencil-vector-taskloop.cc b/Cxx11/stencil-vector-taskloop.cc
index 971d71db1..6cc5fb0cd 100644
--- a/Cxx11/stencil-vector-taskloop.cc
+++ b/Cxx11/stencil-vector-taskloop.cc
@@ -63,7 +63,7 @@
 #include "prk_util.h"
 #include "stencil_taskloop.hpp"
 
-void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs)
+void nothing(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs)
 {
     std::cout << "You are trying to use a stencil that does not exist.\n";
     std::cout << "Please generate the new stencil using the code generator\n";
@@ -174,8 +174,8 @@ int main(int argc, char* argv[])
 
   auto stencil_time = 0.0;
 
-  std::vector<double> in(n*n);;
-  std::vector<double> out(n*n);;
+  prk::vector<double> in(n*n);;
+  prk::vector<double> out(n*n);;
 
   OMP_PARALLEL()
   OMP_MASTER
diff --git a/Cxx11/stencil_taskloop.hpp b/Cxx11/stencil_taskloop.hpp
index 856f41995..874f122cc 100644
--- a/Cxx11/stencil_taskloop.hpp
+++ b/Cxx11/stencil_taskloop.hpp
@@ -1,4 +1,4 @@
-void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void star1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=1; it<n-1; it+=t) {
       for (auto jt=1; jt<n-1; jt+=t) {
@@ -15,7 +15,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void star2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=2; it<n-2; it+=t) {
       for (auto jt=2; jt<n-2; jt+=t) {
@@ -36,7 +36,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void star3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=3; it<n-3; it+=t) {
       for (auto jt=3; jt<n-3; jt+=t) {
@@ -61,7 +61,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void star4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=4; it<n-4; it+=t) {
       for (auto jt=4; jt<n-4; jt+=t) {
@@ -90,7 +90,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void star5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=5; it<n-5; it+=t) {
       for (auto jt=5; jt<n-5; jt+=t) {
@@ -123,7 +123,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void grid1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=1; it<n-1; it+=t) {
       for (auto jt=1; jt<n-1; jt+=t) {
@@ -143,7 +143,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void grid2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=2; it<n-2; it+=t) {
       for (auto jt=2; jt<n-2; jt+=t) {
@@ -177,7 +177,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void grid3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=3; it<n-3; it+=t) {
       for (auto jt=3; jt<n-3; jt+=t) {
@@ -233,7 +233,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void grid4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=4; it<n-4; it+=t) {
       for (auto jt=4; jt<n-4; jt+=t) {
@@ -319,7 +319,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {
+void grid5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {
     OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )
     for (auto it=5; it<n-5; it+=t) {
       for (auto jt=5; jt<n-5; jt+=t) {
diff --git a/Cxx11/transpose-vector-taskloop.cc b/Cxx11/transpose-vector-taskloop.cc
index 17dbad525..56d9fdfa8 100644
--- a/Cxx11/transpose-vector-taskloop.cc
+++ b/Cxx11/transpose-vector-taskloop.cc
@@ -113,8 +113,8 @@ int main(int argc, char * argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  std::vector<double> A(order*order);
-  std::vector<double> B(order*order);
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order);
 
   auto trans_time = 0.0;
 

From d530b2dc5327304a0a9093002574d41c886fef14 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:57:47 -0700
Subject: [PATCH 150/245] better=simpler use of STL

---
 Cxx11/nstream-vector-boost-compute.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc
index 619c02374..785d496a9 100644
--- a/Cxx11/nstream-vector-boost-compute.cc
+++ b/Cxx11/nstream-vector-boost-compute.cc
@@ -119,8 +119,7 @@ int main(int argc, char * argv[])
 
   auto nstream_time = 0.0;
 
-  std::vector<float> h_A;
-  h_A.resize(length);
+  std::vector<float> h_A(length);
 
   const float scalar(3);
 

From d8b28a965f276058e6a3ba9f2aee78e0c6a7fef6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:59:09 -0700
Subject: [PATCH 151/245] add variant for prk::vector

---
 Cxx11/p2p-kernel.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/Cxx11/p2p-kernel.h b/Cxx11/p2p-kernel.h
index f402eba37..54b88c6c6 100644
--- a/Cxx11/p2p-kernel.h
+++ b/Cxx11/p2p-kernel.h
@@ -24,6 +24,17 @@ inline void sweep_tile(int startm, int endm,
   }
 }
 
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, prk::vector<double> & grid)
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
 #else
 
 inline void sweep_tile(int startm, int endm,
@@ -60,4 +71,21 @@ inline void sweep_tile(int startm, int endm,
     }
 }
 
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, prk::vector<double> & grid)
+{
+    for (int i=startm; i<endm; i++) {
+        double olda = grid[  i  *n+(startn-1)];
+        double oldb = grid[(i-1)*n+(startn-1)];
+        for (int j=startn; j<endn; j++) {
+            double const newb = grid[(i-1)*n+j];
+            double const newa = newb - oldb + olda;
+            grid[i*n+j] = newa;
+            olda = newa;
+            oldb = newb;
+        }
+    }
+}
+
 #endif

From 47bce9b0a218b0d9ed27cf6c67b4afd1244f5976 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 21:59:42 -0700
Subject: [PATCH 152/245] try to implement prk::vector - works for some impls

---
 Cxx11/prk_util.h | 73 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 889e7e87d..3ad580f55 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -180,7 +180,7 @@ namespace prk {
 #endif
     }
 
-    template <class T>
+    template <typename T>
     class vector {
 
         private:
@@ -189,26 +189,65 @@ namespace prk {
 
         public:
 
-        vector(size_t n) {
-            //this->data_ = new T[n];
-            this->data_ = prk::malloc<T>(n);
-        }
+            vector(size_t n) {
+                //this->data_ = new T[n];
+                this->data_ = prk::malloc<T>(n);
+            }
+
+            vector(size_t n, T v) {
+                //this->data_ = new T[n];
+                this->data_ = prk::malloc<T>(n);
+                for (size_t i=0; i<n; ++i) this->data_[i] = v;
+            }
+
+            ~vector() {
+                //delete[] this->data_;
+                prk::free(this->data_);
+            }
+
+            T * data() {
+                return this->data_;
+            }
+
+            size_t size() {
+                return this->size_;
+            }
+
+#if 0
+            T const & operator[] (int n) const {
+                return this->data_[n];
+            }
+
+            T & operator[] (int n) {
+                return this->data_[n];
+            }
+#endif
 
-        vector(size_t n, T v) {
-            //this->data_ = new T[n];
-            this->data_ = prk::malloc<T>(n);
-            for (size_t i=0; i<n; ++i) this->data_[i] = v;
-        }
+            T const & operator[] (size_t n) const {
+                return this->data_[n];
+            }
 
-        ~vector() {
-            //delete[] this->data_;
-            prk::free(this->data_);
-        }
+            T & operator[] (size_t n) {
+                return this->data_[n];
+            }
 
-        T & operator[] (size_t n) {
-            return this->data_[n];
-        }
+            T * begin() {
+                return &(this->data_[0]);
+            }
+
+            T * end() {
+                return &(this->data_[this->size_]);
+            }
 
+#if 0
+            T & begin() {
+                return this->data_[0];
+            }
+
+            T & end() {
+                return this->data_[this->size_];
+            }
+#endif
     };
 
     static inline double wtime(void)

From 2970a8417effd7c6cb7dce2f107c482b0173de9d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 22:01:22 -0700
Subject: [PATCH 153/245] add versions that use prk::vector rather than STL

---
 Cxx11/dgemm.cc     | 224 ++++++++++++++++++++++++++++++++++++++++++
 Cxx11/nstream.cc   | 166 ++++++++++++++++++++++++++++++++
 Cxx11/p2p.cc       | 185 +++++++++++++++++++++++++++++++++++
 Cxx11/sparse.cc    | 235 +++++++++++++++++++++++++++++++++++++++++++++
 Cxx11/stencil.cc   | 231 ++++++++++++++++++++++++++++++++++++++++++++
 Cxx11/transpose.cc | 177 ++++++++++++++++++++++++++++++++++
 6 files changed, 1218 insertions(+)
 create mode 100644 Cxx11/dgemm.cc
 create mode 100644 Cxx11/nstream.cc
 create mode 100644 Cxx11/p2p.cc
 create mode 100644 Cxx11/sparse.cc
 create mode 100644 Cxx11/stencil.cc
 create mode 100644 Cxx11/transpose.cc

diff --git a/Cxx11/dgemm.cc b/Cxx11/dgemm.cc
new file mode 100644
index 000000000..5d7fa7897
--- /dev/null
+++ b/Cxx11/dgemm.cc
@@ -0,0 +1,224 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix
+///          dense multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a tile size for matrix
+///          blocking
+///
+///          <progname> <# iterations> <matrix order> [<tile size>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+void prk_dgemm(const int order,
+               const prk::vector<double> & A,
+               const prk::vector<double> & B,
+                     prk::vector<double> & C)
+{
+    PRAGMA_SIMD
+    for (auto i=0; i<order; ++i) {
+      PRAGMA_SIMD
+      for (auto k=0; k<order; ++k) {
+        PRAGMA_SIMD
+        for (auto j=0; j<order; ++j) {
+            C[i*order+j] += A[i*order+k] * B[k*order+j];
+        }
+      }
+    }
+}
+
+void prk_dgemm(const int order, const int tile_size,
+               const prk::vector<double> & A,
+               const prk::vector<double> & B,
+                     prk::vector<double> & C)
+{
+    for (auto it=0; it<order; it+=tile_size) {
+      for (auto kt=0; kt<order; kt+=tile_size) {
+        for (auto jt=0; jt<order; jt+=tile_size) {
+          // ICC will not hoist these on its own...
+          auto iend = std::min(order,it+tile_size);
+          auto jend = std::min(order,jt+tile_size);
+          auto kend = std::min(order,kt+tile_size);
+          PRAGMA_SIMD
+          for (auto i=it; i<iend; ++i) {
+            PRAGMA_SIMD
+            for (auto k=kt; k<kend; ++k) {
+              PRAGMA_SIMD
+              for (auto j=jt; j<jend; ++j) {
+                C[i*order+j] += A[i*order+k] * B[k*order+j];
+              }
+            }
+          }
+        }
+      }
+    }
+}
+
+int main(int argc, char * argv[])
+{
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  int iterations;
+  int order;
+  int tile_size;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [tile size]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+      if (tile_size <= 0) tile_size = order;
+
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  if (tile_size < order) {
+      std::cout << "Tile size            = " << tile_size << std::endl;
+  } else {
+      std::cout << "Untiled (IKJ loop order)" << std::endl;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  double dgemm_time(0);
+
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order);
+  prk::vector<double> C(order*order,0.0);
+  for (auto i=0; i<order; ++i) {
+    for (auto j=0; j<order; ++j) {
+       A[i*order+j] = i;
+       B[i*order+j] = i;
+    }
+  }
+
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) dgemm_time = prk::wtime();
+
+      if (tile_size < order) {
+          prk_dgemm(order, tile_size, A, B, C);
+      } else {
+          prk_dgemm(order, A, B, C);
+      }
+    }
+    dgemm_time = prk::wtime() - dgemm_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto forder = static_cast<double>(order);
+  const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
+  const auto checksum = prk::reduce(C.begin(), C.end(), 0.0);
+
+  const auto epsilon = 1.0e-8;
+  const auto residuum = std::abs(checksum-reference)/reference;
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum = " << checksum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = dgemm_time/iterations;
+    auto nflops = 2.0 * std::pow(forder,3);
+    std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum = " << checksum << std::endl;
+#if VERBOSE
+    for (auto i=0; i<order; ++i)
+      for (auto j=0; j<order; ++j)
+        std::cout << "A(" << i << "," << j << ") = " << A[i*order+j] << "\n";
+    for (auto i=0; i<order; ++i)
+      for (auto j=0; j<order; ++j)
+        std::cout << "B(" << i << "," << j << ") = " << B[i*order+j] << "\n";
+    for (auto i=0; i<order; ++i)
+      for (auto j=0; j<order; ++j)
+        std::cout << "C(" << i << "," << j << ") = " << C[i*order+j] << "\n";
+    std::cout << std::endl;
+#endif
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/nstream.cc b/Cxx11/nstream.cc
new file mode 100644
index 000000000..5673d3cf6
--- /dev/null
+++ b/Cxx11/nstream.cc
@@ -0,0 +1,166 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length> [<offset>]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  prk::vector<double> A(length,0.0);
+  prk::vector<double> B(length,2.0);
+  prk::vector<double> C(length,2.0);
+
+  double scalar = 3.0;
+
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      for (size_t i=0; i<length; i++) {
+          A[i] += B[i] + scalar * C[i];
+      }
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += std::fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/p2p.cc b/Cxx11/p2p.cc
new file mode 100644
index 000000000..119fecfe2
--- /dev/null
+++ b/Cxx11/p2p.cc
@@ -0,0 +1,185 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an m*n grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <m> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "p2p-kernel.h"
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int m, n;
+  int mc, nc;
+  try {
+      if (argc < 4){
+        throw " <# iterations> <first array dimension> <second array dimension> [<first chunk dimension> <second chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      m = std::atoi(argv[2]);
+      n = std::atoi(argv[3]);
+      if (m < 1 || n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(m)*static_cast<size_t>(n) > INT_MAX) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      mc = (argc > 4) ? std::atoi(argv[4]) : m;
+      nc = (argc > 5) ? std::atoi(argv[5]) : n;
+      if (mc < 1 || mc > m || nc < 1 || nc > n) {
+        std::cout << "WARNING: grid chunk dimensions invalid: " << mc <<  nc << " (ignoring)" << std::endl;
+        mc = m;
+        nc = n;
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << m << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes     = " << mc << ", " << nc << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto pipeline_time = 0.0; // silence compiler warning
+
+  prk::vector<double> grid(m*n,0.0);;
+
+  {
+    // set boundary values (bottom and left side of grid)
+    for (int j=0; j<n; j++) {
+      grid[0*n+j] = static_cast<double>(j);
+    }
+    for (int i=0; i<m; i++) {
+      grid[i*n+0] = static_cast<double>(i);
+    }
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) pipeline_time = prk::wtime();
+
+      double * RESTRICT pgrid = grid.data();
+
+      if (mc==m && nc==n) {
+        for (int i=1; i<m; i++) {
+          double olda = grid[  i  *n];
+          double oldb = grid[(i-1)*n];
+          for (int j=1; j<n; j++) {
+            double const newb = grid[(i-1)*n+j];
+            double const newa = newb - oldb + olda;
+            grid[i*n+j] = newa;
+            olda = newa;
+            oldb = newb;
+          }
+        }
+      } else {
+        for (int i=1; i<m; i+=mc) {
+          for (int j=1; j<n; j+=nc) {
+            sweep_tile(i, std::min(m,i+mc), j, std::min(n,j+nc), n, pgrid);
+          }
+        }
+      }
+      pgrid[0*n+0] = -pgrid[(m-1)*n+(n-1)];
+    }
+    pipeline_time = prk::wtime() - pipeline_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  auto corner_val = ((iterations+1.)*(n+m-2.));
+  if ( (std::fabs(grid[(m-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)]
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  return 0;
+}
diff --git a/Cxx11/sparse.cc b/Cxx11/sparse.cc
new file mode 100644
index 000000000..38fb68deb
--- /dev/null
+++ b/Cxx11/sparse.cc
@@ -0,0 +1,235 @@
+
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "in" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+static inline size_t offset(size_t i, size_t j, size_t lsize)
+{
+    return (i+(j<<lsize));
+}
+
+/* Code below reverses bits in unsigned integer stored in a 64-bit word.
+   Bit reversal is with respect to the largest integer that is going to be
+   processed for the particular run of the code, to make sure the reversal
+   constitutes a true permutation. Hence, the final result needs to be shifted
+   to the right.
+   Example: if largest integer being processed is 0x000000ff = 255 =
+   0000...0011111111 (binary), then the unshifted reversal of 0x00000006 = 6 =
+   0000...0000000110 (binary) would be 011000000...0000 = 3*2^61, which is
+   outside the range of the original sequence 0-255. Setting shift_in_bits to
+   2log(256) = 8, the final result is shifted the the right by 64-8=56 bits,
+   so we get 000...0001100000 (binary) = 96, which is within the proper range */
+
+static inline uint64_t reverse(uint64_t x, int shift_in_bits)
+{
+  x = ((x >> 1)  & 0x5555555555555555) | ((x << 1)  & 0xaaaaaaaaaaaaaaaa);
+  x = ((x >> 2)  & 0x3333333333333333) | ((x << 2)  & 0xcccccccccccccccc);
+  x = ((x >> 4)  & 0x0f0f0f0f0f0f0f0f) | ((x << 4)  & 0xf0f0f0f0f0f0f0f0);
+  x = ((x >> 8)  & 0x00ff00ff00ff00ff) | ((x << 8)  & 0xff00ff00ff00ff00);
+  x = ((x >> 16) & 0x0000ffff0000ffff) | ((x << 16) & 0xffff0000ffff0000);
+  x = ((x >> 32) & 0x00000000ffffffff) | ((x << 32) & 0xffffffff00000000);
+  return ( x >> (8*sizeof(uint64_t)-shift_in_bits) );
+}
+
+#if SCRAMBLE
+  #define REVERSE(a,b)  reverse((a),(b))
+#else
+  #define REVERSE(a,b) (a)
+#endif
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 Sparse matrix-vector multiplication" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, lsize, radius, stencil_size;
+  size_t size, size2, nent;
+  double sparsity;
+  try {
+      if (argc < 4) {
+        throw "Usage: <# iterations> <2log grid size> <stencil radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      lsize  = std::atoi(argv[2]);
+      if (lsize < 1) {
+        throw "ERROR: grid dimension must be positive";
+      }
+      size_t lsize2 = 2*lsize;
+      size = 1L<<lsize;
+      size2 = size*size;
+
+      // stencil radius
+      radius = std::atoi(argv[5]);
+
+      if (radius < 0) {
+        throw "ERROR: Stencil radius must be nonnegative";
+      }
+
+      stencil_size = 4*radius+1;
+      sparsity = (4.*radius+1.)/size2;
+      nent = size2 * stencil_size;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << size2 << std::endl;
+  std::cout << "Stencil diameter     = " << 2*radius+1 << std::endl;
+  std::cout << "Sparsity             = " << sparsity << std::endl;
+#if SCRAMBLE
+  std::cout << "Using scrambled indexing"  << std::endl;
+#else
+  std::cout << "Using canonical indexing"  << std::endl;
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  prk::vector<double> matrix(nent,0.0);
+  prk::vector<size_t> colIndex(nent,0);
+  prk::vector<double> vector(size2,0.0);
+  prk::vector<double> result(size2,0.0);
+
+  double sparse_time(0);
+
+  {
+    for (size_t row=0; row<size2; row++) {
+      size_t i = row % size;
+      size_t j = row / size;
+      size_t elm = row*stencil_size;
+      colIndex[elm] = REVERSE(offset(i,j,lsize),lsize2);
+      for (size_t r=1; r<=radius; r++, elm+=4) {
+        colIndex[elm+1] = REVERSE(offset((i+r)%size,j,lsize),lsize2);
+        colIndex[elm+2] = REVERSE(offset((i-r+size)%size,j,lsize),lsize2);
+        colIndex[elm+3] = REVERSE(offset(i,(j+r)%size,lsize),lsize2);
+        colIndex[elm+4] = REVERSE(offset(i,(j-r+size)%size,lsize),lsize2);
+      }
+      std::sort(&(colIndex[row*stencil_size]), &(colIndex[(row+1)*stencil_size]));
+      for (size_t elm=row*stencil_size; elm<(row+1)*stencil_size; elm++) {
+        matrix[elm] = 1.0/(colIndex[elm]+1.);
+      }
+    }
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) sparse_time = prk::wtime();
+
+      for (size_t row=0; row<size2; row++) {
+          vector[row] += (row+1.);
+      }
+
+      for (size_t row=0; row<size2; row++) {
+          double temp(0);
+          for (size_t col=stencil_size*row; col<stencil_size*(row+1); col++) {
+              temp += matrix[col]*vector[colIndex[col]];
+          }
+          result[row] += temp;
+      }
+
+    }
+    sparse_time = prk::wtime() - sparse_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  double reference_sum = (0.5*nent) * (iterations+1.) * (iterations+2.);
+
+  double vector_sum(0);
+  for (size_t row=0; row<size2; row++) {
+      vector_sum += result[row];
+  }
+
+  const double epsilon(1.e-8);
+
+  if (std::fabs(vector_sum-reference_sum) > epsilon) {
+    std::cout << "ERROR: Vector norm = " << vector_sum
+              << " Reference vector norm = " << reference_sum << std::endl;
+    return 1;
+  } else {
+    std::cout << "Solution validates" << std::endl;
+#ifdef VERBOSE
+    std::cout << "Reference sum = " << reference_sum
+              << ", vector sum = " << vector_sum << std::endl;
+#endif
+    double avgtime = sparse_time/iterations;
+    std::cout << "Rate (MFlops/s): " << 1.0e-6 * (2.*nent)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
diff --git a/Cxx11/stencil.cc b/Cxx11/stencil.cc
new file mode 100644
index 000000000..f0aab6461
--- /dev/null
+++ b/Cxx11/stencil.cc
@@ -0,0 +1,231 @@
+
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "in" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "stencil_seq.hpp"
+
+void nothing(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out)
+{
+    std::cout << "You are trying to use a stencil that does not exist.\n";
+    std::cout << "Please generate the new stencil using the code generator\n";
+    std::cout << "and add it to the case-switch in the driver." << std::endl;
+    // n will never be zero - this is to silence compiler warnings.
+    if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl;
+    std::abort();
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 Stencil execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, n, radius, tile_size;
+  bool star = true;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n  = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+  std::cout << "Radius of stencil    = " << radius << std::endl;
+
+  auto stencil = nothing;
+  if (star) {
+      switch (radius) {
+          case 1: stencil = star1; break;
+          case 2: stencil = star2; break;
+          case 3: stencil = star3; break;
+          case 4: stencil = star4; break;
+          case 5: stencil = star5; break;
+      }
+  } else {
+      switch (radius) {
+          case 1: stencil = grid1; break;
+          case 2: stencil = grid2; break;
+          case 3: stencil = grid3; break;
+          case 4: stencil = grid4; break;
+          case 5: stencil = grid5; break;
+      }
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto stencil_time = 0.0;
+
+  prk::vector<double> in(n*n);
+  prk::vector<double> out(n*n);
+
+  {
+    for (auto it=0; it<n; it+=tile_size) {
+      for (auto jt=0; jt<n; jt+=tile_size) {
+        for (auto i=it; i<std::min(n,it+tile_size); i++) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n,jt+tile_size); j++) {
+            in[i*n+j] = static_cast<double>(i+j);
+            out[i*n+j] = 0.0;
+          }
+        }
+      }
+    }
+
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) stencil_time = prk::wtime();
+      // Apply the stencil operator
+      stencil(n, tile_size, in, out);
+      // Add constant to solution to force refresh of neighbor data, if any
+      std::transform(in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
+    }
+    stencil_time = prk::wtime() - stencil_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  // interior of grid with respect to stencil
+  size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
+
+  // compute L1 norm in parallel
+  double norm = 0.0;
+  for (auto i=radius; i<n-radius; i++) {
+    for (auto j=radius; j<n-radius; j++) {
+      norm += std::fabs(out[i*n+j]);
+    }
+  }
+  norm /= active_points;
+
+  // verify correctness
+  const double epsilon = 1.0e-8;
+  double reference_norm = 2.*(iterations+1.);
+  if (std::fabs(norm-reference_norm) > epsilon) {
+    std::cout << "ERROR: L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+    return 1;
+  } else {
+    std::cout << "Solution validates" << std::endl;
+#ifdef VERBOSE
+    std::cout << "L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+#endif
+    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+    size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
+    auto avgtime = stencil_time/iterations;
+    std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
diff --git a/Cxx11/transpose.cc b/Cxx11/transpose.cc
new file mode 100644
index 000000000..7907bae56
--- /dev/null
+++ b/Cxx11/transpose.cc
@@ -0,0 +1,177 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int tile_size;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [tile size]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+      // a negative tile size means no tiling of the local transpose
+      if (tile_size <= 0) tile_size = order;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto trans_time = 0.0;
+
+  prk::vector<double> A(order*order);
+  prk::vector<double> B(order*order,0.0);
+
+  // fill A with the sequence 0 to order^2-1 as doubles
+  std::iota(A.begin(), A.end(), 0.0);
+
+  {
+    for (auto iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) trans_time = prk::wtime();
+
+      // transpose the  matrix
+      if (tile_size < order) {
+        for (auto it=0; it<order; it+=tile_size) {
+          for (auto jt=0; jt<order; jt+=tile_size) {
+            for (auto i=it; i<std::min(order,it+tile_size); i++) {
+              for (auto j=jt; j<std::min(order,jt+tile_size); j++) {
+                B[i*order+j] += A[j*order+i];
+                A[j*order+i] += 1.0;
+              }
+            }
+          }
+        }
+      } else {
+        for (auto i=0;i<order; i++) {
+          for (auto j=0;j<order;j++) {
+            B[i*order+j] += A[j*order+i];
+            A[j*order+i] += 1.0;
+          }
+        }
+      }
+    }
+    trans_time = prk::wtime() - trans_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  // TODO: replace with std::generate, std::accumulate, or similar
+  for (auto j=0; j<order; j++) {
+    for (auto i=0; i<order; i++) {
+      const int ij = i*order+j;
+      const int ji = j*order+i;
+      const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+

From c4f79aaeb0a4139a5b70d741729064864026c085 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 22:01:31 -0700
Subject: [PATCH 154/245] add versions that use prk::vector rather than STL

---
 Cxx11/stencil_seq.hpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/Cxx11/stencil_seq.hpp b/Cxx11/stencil_seq.hpp
index 4ed03972e..c6af093af 100644
--- a/Cxx11/stencil_seq.hpp
+++ b/Cxx11/stencil_seq.hpp
@@ -1,4 +1,4 @@
-void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=1; it<n-1; it+=t) {
       for (auto jt=1; jt<n-1; jt+=t) {
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
@@ -14,7 +14,7 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=2; it<n-2; it+=t) {
       for (auto jt=2; jt<n-2; jt+=t) {
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
@@ -34,7 +34,7 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=3; it<n-3; it+=t) {
       for (auto jt=3; jt<n-3; jt+=t) {
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
@@ -58,7 +58,7 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=4; it<n-4; it+=t) {
       for (auto jt=4; jt<n-4; jt+=t) {
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
@@ -86,7 +86,7 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void star5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=5; it<n-5; it+=t) {
       for (auto jt=5; jt<n-5; jt+=t) {
         for (auto i=it; i<std::min(n-5,it+t); ++i) {
@@ -118,7 +118,7 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid1(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=1; it<n-1; it+=t) {
       for (auto jt=1; jt<n-1; jt+=t) {
         for (auto i=it; i<std::min(n-1,it+t); ++i) {
@@ -137,7 +137,7 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid2(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=2; it<n-2; it+=t) {
       for (auto jt=2; jt<n-2; jt+=t) {
         for (auto i=it; i<std::min(n-2,it+t); ++i) {
@@ -170,7 +170,7 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid3(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=3; it<n-3; it+=t) {
       for (auto jt=3; jt<n-3; jt+=t) {
         for (auto i=it; i<std::min(n-3,it+t); ++i) {
@@ -225,7 +225,7 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid4(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=4; it<n-4; it+=t) {
       for (auto jt=4; jt<n-4; jt+=t) {
         for (auto i=it; i<std::min(n-4,it+t); ++i) {
@@ -310,7 +310,7 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
      }
 }
 
-void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+void grid5(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {
     for (auto it=5; it<n-5; it+=t) {
       for (auto jt=5; jt<n-5; jt+=t) {
         for (auto i=it; i<std::min(n-5,it+t); ++i) {

From 7513e36166fb27e1238b9c8d6cf0188c34dab1fe Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 22:01:52 -0700
Subject: [PATCH 155/245] ignore more stuff

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 66948e148..5a90844a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -277,3 +277,7 @@ FORTRAN/transpose-ornlacc
 RUST/p2p/Cargo.lock
 RUST/stencil/Cargo.lock
 RUST/transpose/Cargo.lock
+nstream
+../C1z/p2p-avx
+../C1z/p2p-sse
+../C1z/p2p-hyperplane-openmp

From 75a60ab98f4ae7b2cdbd2b9a0e3548c781732a87 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 20 Apr 2019 22:06:19 -0700
Subject: [PATCH 156/245] cleanup stencil codegen for vector classes

---
 Cxx11/generate-cxx-stencil.py |  22 +-
 Cxx11/stencil-vector.cc       |   2 +-
 Cxx11/stencil_vector.hpp      | 435 ++++++++++++++++++++++++++++++++++
 3 files changed, 453 insertions(+), 6 deletions(-)
 create mode 100644 Cxx11/stencil_vector.hpp

diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index 18d826acd..2f557fe3f 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -53,7 +53,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('       }\n')
         src.write('     }\n')
     elif (model=='taskloop'):
-        src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out, const int gs) {\n')
+        src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out, const int gs) {\n')
         src.write('    OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )\n')
         src.write('    for (auto it='+str(radius)+'; it<n-'+str(radius)+'; it+=t) {\n')
         src.write('      for (auto jt='+str(radius)+'; jt<n-'+str(radius)+'; jt+=t) {\n')
@@ -74,7 +74,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('       }\n')
         src.write('     }\n')
     elif (model=='rangefor'):
-        src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
+        src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {\n')
         src.write('    auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n')
         src.write('    for (auto i : inside) {\n')
         src.write('      PRAGMA_SIMD\n')
@@ -128,7 +128,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         bodygen(src,pattern,stencil_size,radius,W,model)
         src.write('    });\n')
     elif (model=='tbb'):
-        src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
+        src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {\n')
         src.write('  tbb::blocked_range2d<int> range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n')
         src.write('  tbb::parallel_for( range, [&](decltype(range)& r ) {\n')
         src.write('    for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {\n')
@@ -151,7 +151,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('    if ( ('+str(radius)+' <= i) && (i < n-'+str(radius)+') && ('+str(radius)+' <= j) && (j < n-'+str(radius)+') ) {\n')
         bodygen(src,pattern,stencil_size,radius,W,model)
         src.write('     }\n')
-    else:
+    elif (model=='vector'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
         src.write('    for (auto it='+str(radius)+'; it<n-'+str(radius)+'; it+=t) {\n')
         src.write('      for (auto jt='+str(radius)+'; jt<n-'+str(radius)+'; jt+=t) {\n')
@@ -163,6 +163,18 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('         }\n')
         src.write('       }\n')
         src.write('     }\n')
+    else:
+        src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector<double> & in, prk::vector<double> & out) {\n')
+        src.write('    for (auto it='+str(radius)+'; it<n-'+str(radius)+'; it+=t) {\n')
+        src.write('      for (auto jt='+str(radius)+'; jt<n-'+str(radius)+'; jt+=t) {\n')
+        src.write('        for (auto i=it; i<std::min(n-'+str(radius)+',it+t); ++i) {\n')
+        src.write('          PRAGMA_SIMD\n')
+        src.write('          for (auto j=jt; j<std::min(n-'+str(radius)+',jt+t); ++j) {\n')
+        bodygen(src,pattern,stencil_size,radius,W,model)
+        src.write('           }\n')
+        src.write('         }\n')
+        src.write('       }\n')
+        src.write('     }\n')
     src.write('}\n\n')
 
 def instance(src,model,pattern,r):
@@ -191,7 +203,7 @@ def instance(src,model,pattern,r):
     codegen(src,pattern,stencil_size,r,W,model)
 
 def main():
-    for model in ['seq','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','rajaview','kokkos','cuda']:
+    for model in ['seq','vector','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','rajaview','kokkos','cuda']:
       src = open('stencil_'+model+'.hpp','w')
       if (model=='target'):
           src.write('#define RESTRICT __restrict__\n\n')
diff --git a/Cxx11/stencil-vector.cc b/Cxx11/stencil-vector.cc
index 26931780d..95c4ef860 100644
--- a/Cxx11/stencil-vector.cc
+++ b/Cxx11/stencil-vector.cc
@@ -61,7 +61,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
-#include "stencil_seq.hpp"
+#include "stencil_vector.hpp"
 
 void nothing(const int n, const int t, std::vector<double> & in, std::vector<double> & out)
 {
diff --git a/Cxx11/stencil_vector.hpp b/Cxx11/stencil_vector.hpp
new file mode 100644
index 000000000..4ed03972e
--- /dev/null
+++ b/Cxx11/stencil_vector.hpp
@@ -0,0 +1,435 @@
+void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=1; it<n-1; it+=t) {
+      for (auto jt=1; jt<n-1; jt+=t) {
+        for (auto i=it; i<std::min(n-1,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
+            out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
+                          +in[(i-1)*n+(j)] * -0.5
+                          +in[(i+1)*n+(j)] * 0.5
+                          +in[(i)*n+(j+1)] * 0.5;
+           }
+         }
+       }
+     }
+}
+
+void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=2; it<n-2; it+=t) {
+      for (auto jt=2; jt<n-2; jt+=t) {
+        for (auto i=it; i<std::min(n-2,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
+            out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-2)*n+(j)] * -0.125
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i+2)*n+(j)] * 0.125
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i)*n+(j+2)] * 0.125;
+           }
+         }
+       }
+     }
+}
+
+void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=3; it<n-3; it+=t) {
+      for (auto jt=3; jt<n-3; jt+=t) {
+        for (auto i=it; i<std::min(n-3,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
+            out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
+                          +in[(i)*n+(j-2)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.166666666667
+                          +in[(i-3)*n+(j)] * -0.0555555555556
+                          +in[(i-2)*n+(j)] * -0.0833333333333
+                          +in[(i-1)*n+(j)] * -0.166666666667
+                          +in[(i+1)*n+(j)] * 0.166666666667
+                          +in[(i+2)*n+(j)] * 0.0833333333333
+                          +in[(i+3)*n+(j)] * 0.0555555555556
+                          +in[(i)*n+(j+1)] * 0.166666666667
+                          +in[(i)*n+(j+2)] * 0.0833333333333
+                          +in[(i)*n+(j+3)] * 0.0555555555556;
+           }
+         }
+       }
+     }
+}
+
+void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=4; it<n-4; it+=t) {
+      for (auto jt=4; jt<n-4; jt+=t) {
+        for (auto i=it; i<std::min(n-4,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
+            out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
+                          +in[(i)*n+(j-3)] * -0.0416666666667
+                          +in[(i)*n+(j-2)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i-4)*n+(j)] * -0.03125
+                          +in[(i-3)*n+(j)] * -0.0416666666667
+                          +in[(i-2)*n+(j)] * -0.0625
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0625
+                          +in[(i+3)*n+(j)] * 0.0416666666667
+                          +in[(i+4)*n+(j)] * 0.03125
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i)*n+(j+2)] * 0.0625
+                          +in[(i)*n+(j+3)] * 0.0416666666667
+                          +in[(i)*n+(j+4)] * 0.03125;
+           }
+         }
+       }
+     }
+}
+
+void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=5; it<n-5; it+=t) {
+      for (auto jt=5; jt<n-5; jt+=t) {
+        for (auto i=it; i<std::min(n-5,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
+            out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
+                          +in[(i)*n+(j-4)] * -0.025
+                          +in[(i)*n+(j-3)] * -0.0333333333333
+                          +in[(i)*n+(j-2)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.1
+                          +in[(i-5)*n+(j)] * -0.02
+                          +in[(i-4)*n+(j)] * -0.025
+                          +in[(i-3)*n+(j)] * -0.0333333333333
+                          +in[(i-2)*n+(j)] * -0.05
+                          +in[(i-1)*n+(j)] * -0.1
+                          +in[(i+1)*n+(j)] * 0.1
+                          +in[(i+2)*n+(j)] * 0.05
+                          +in[(i+3)*n+(j)] * 0.0333333333333
+                          +in[(i+4)*n+(j)] * 0.025
+                          +in[(i+5)*n+(j)] * 0.02
+                          +in[(i)*n+(j+1)] * 0.1
+                          +in[(i)*n+(j+2)] * 0.05
+                          +in[(i)*n+(j+3)] * 0.0333333333333
+                          +in[(i)*n+(j+4)] * 0.025
+                          +in[(i)*n+(j+5)] * 0.02;
+           }
+         }
+       }
+     }
+}
+
+void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=1; it<n-1; it+=t) {
+      for (auto jt=1; jt<n-1; jt+=t) {
+        for (auto i=it; i<std::min(n-1,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-1,jt+t); ++j) {
+            out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
+                          +in[(i)*n+(j-1)] * -0.25
+                          +in[(i-1)*n+(j)] * -0.25
+                          +in[(i+1)*n+(j)] * 0.25
+                          +in[(i)*n+(j+1)] * 0.25
+                          +in[(i+1)*n+(j+1)] * 0.25
+                          ;
+           }
+         }
+       }
+     }
+}
+
+void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=2; it<n-2; it+=t) {
+      for (auto jt=2; jt<n-2; jt+=t) {
+        for (auto i=it; i<std::min(n-2,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-2,jt+t); ++j) {
+            out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
+                          +in[(i-1)*n+(j-2)] * -0.0208333333333
+                          +in[(i)*n+(j-2)] * -0.0208333333333
+                          +in[(i+1)*n+(j-2)] * -0.0208333333333
+                          +in[(i-2)*n+(j-1)] * -0.0208333333333
+                          +in[(i-1)*n+(j-1)] * -0.125
+                          +in[(i)*n+(j-1)] * -0.125
+                          +in[(i+2)*n+(j-1)] * 0.0208333333333
+                          +in[(i-2)*n+(j)] * -0.0208333333333
+                          +in[(i-1)*n+(j)] * -0.125
+                          +in[(i+1)*n+(j)] * 0.125
+                          +in[(i+2)*n+(j)] * 0.0208333333333
+                          +in[(i-2)*n+(j+1)] * -0.0208333333333
+                          +in[(i)*n+(j+1)] * 0.125
+                          +in[(i+1)*n+(j+1)] * 0.125
+                          +in[(i+2)*n+(j+1)] * 0.0208333333333
+                          +in[(i-1)*n+(j+2)] * 0.0208333333333
+                          +in[(i)*n+(j+2)] * 0.0208333333333
+                          +in[(i+1)*n+(j+2)] * 0.0208333333333
+                          +in[(i+2)*n+(j+2)] * 0.0625
+                          ;
+           }
+         }
+       }
+     }
+}
+
+void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=3; it<n-3; it+=t) {
+      for (auto jt=3; jt<n-3; jt+=t) {
+        for (auto i=it; i<std::min(n-3,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-3,jt+t); ++j) {
+            out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
+                          +in[(i-2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-1)*n+(j-3)] * -0.00555555555556
+                          +in[(i)*n+(j-3)] * -0.00555555555556
+                          +in[(i+1)*n+(j-3)] * -0.00555555555556
+                          +in[(i+2)*n+(j-3)] * -0.00555555555556
+                          +in[(i-3)*n+(j-2)] * -0.00555555555556
+                          +in[(i-2)*n+(j-2)] * -0.0416666666667
+                          +in[(i-1)*n+(j-2)] * -0.0138888888889
+                          +in[(i)*n+(j-2)] * -0.0138888888889
+                          +in[(i+1)*n+(j-2)] * -0.0138888888889
+                          +in[(i+3)*n+(j-2)] * 0.00555555555556
+                          +in[(i-3)*n+(j-1)] * -0.00555555555556
+                          +in[(i-2)*n+(j-1)] * -0.0138888888889
+                          +in[(i-1)*n+(j-1)] * -0.0833333333333
+                          +in[(i)*n+(j-1)] * -0.0833333333333
+                          +in[(i+2)*n+(j-1)] * 0.0138888888889
+                          +in[(i+3)*n+(j-1)] * 0.00555555555556
+                          +in[(i-3)*n+(j)] * -0.00555555555556
+                          +in[(i-2)*n+(j)] * -0.0138888888889
+                          +in[(i-1)*n+(j)] * -0.0833333333333
+                          +in[(i+1)*n+(j)] * 0.0833333333333
+                          +in[(i+2)*n+(j)] * 0.0138888888889
+                          +in[(i+3)*n+(j)] * 0.00555555555556
+                          +in[(i-3)*n+(j+1)] * -0.00555555555556
+                          +in[(i-2)*n+(j+1)] * -0.0138888888889
+                          +in[(i)*n+(j+1)] * 0.0833333333333
+                          +in[(i+1)*n+(j+1)] * 0.0833333333333
+                          +in[(i+2)*n+(j+1)] * 0.0138888888889
+                          +in[(i+3)*n+(j+1)] * 0.00555555555556
+                          +in[(i-3)*n+(j+2)] * -0.00555555555556
+                          +in[(i-1)*n+(j+2)] * 0.0138888888889
+                          +in[(i)*n+(j+2)] * 0.0138888888889
+                          +in[(i+1)*n+(j+2)] * 0.0138888888889
+                          +in[(i+2)*n+(j+2)] * 0.0416666666667
+                          +in[(i+3)*n+(j+2)] * 0.00555555555556
+                          +in[(i-2)*n+(j+3)] * 0.00555555555556
+                          +in[(i-1)*n+(j+3)] * 0.00555555555556
+                          +in[(i)*n+(j+3)] * 0.00555555555556
+                          +in[(i+1)*n+(j+3)] * 0.00555555555556
+                          +in[(i+2)*n+(j+3)] * 0.00555555555556
+                          +in[(i+3)*n+(j+3)] * 0.0277777777778
+                          ;
+           }
+         }
+       }
+     }
+}
+
+void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=4; it<n-4; it+=t) {
+      for (auto jt=4; jt<n-4; jt+=t) {
+        for (auto i=it; i<std::min(n-4,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-4,jt+t); ++j) {
+            out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
+                          +in[(i-3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-2)*n+(j-4)] * -0.00223214285714
+                          +in[(i-1)*n+(j-4)] * -0.00223214285714
+                          +in[(i)*n+(j-4)] * -0.00223214285714
+                          +in[(i+1)*n+(j-4)] * -0.00223214285714
+                          +in[(i+2)*n+(j-4)] * -0.00223214285714
+                          +in[(i+3)*n+(j-4)] * -0.00223214285714
+                          +in[(i-4)*n+(j-3)] * -0.00223214285714
+                          +in[(i-3)*n+(j-3)] * -0.0208333333333
+                          +in[(i-2)*n+(j-3)] * -0.00416666666667
+                          +in[(i-1)*n+(j-3)] * -0.00416666666667
+                          +in[(i)*n+(j-3)] * -0.00416666666667
+                          +in[(i+1)*n+(j-3)] * -0.00416666666667
+                          +in[(i+2)*n+(j-3)] * -0.00416666666667
+                          +in[(i+4)*n+(j-3)] * 0.00223214285714
+                          +in[(i-4)*n+(j-2)] * -0.00223214285714
+                          +in[(i-3)*n+(j-2)] * -0.00416666666667
+                          +in[(i-2)*n+(j-2)] * -0.03125
+                          +in[(i-1)*n+(j-2)] * -0.0104166666667
+                          +in[(i)*n+(j-2)] * -0.0104166666667
+                          +in[(i+1)*n+(j-2)] * -0.0104166666667
+                          +in[(i+3)*n+(j-2)] * 0.00416666666667
+                          +in[(i+4)*n+(j-2)] * 0.00223214285714
+                          +in[(i-4)*n+(j-1)] * -0.00223214285714
+                          +in[(i-3)*n+(j-1)] * -0.00416666666667
+                          +in[(i-2)*n+(j-1)] * -0.0104166666667
+                          +in[(i-1)*n+(j-1)] * -0.0625
+                          +in[(i)*n+(j-1)] * -0.0625
+                          +in[(i+2)*n+(j-1)] * 0.0104166666667
+                          +in[(i+3)*n+(j-1)] * 0.00416666666667
+                          +in[(i+4)*n+(j-1)] * 0.00223214285714
+                          +in[(i-4)*n+(j)] * -0.00223214285714
+                          +in[(i-3)*n+(j)] * -0.00416666666667
+                          +in[(i-2)*n+(j)] * -0.0104166666667
+                          +in[(i-1)*n+(j)] * -0.0625
+                          +in[(i+1)*n+(j)] * 0.0625
+                          +in[(i+2)*n+(j)] * 0.0104166666667
+                          +in[(i+3)*n+(j)] * 0.00416666666667
+                          +in[(i+4)*n+(j)] * 0.00223214285714
+                          +in[(i-4)*n+(j+1)] * -0.00223214285714
+                          +in[(i-3)*n+(j+1)] * -0.00416666666667
+                          +in[(i-2)*n+(j+1)] * -0.0104166666667
+                          +in[(i)*n+(j+1)] * 0.0625
+                          +in[(i+1)*n+(j+1)] * 0.0625
+                          +in[(i+2)*n+(j+1)] * 0.0104166666667
+                          +in[(i+3)*n+(j+1)] * 0.00416666666667
+                          +in[(i+4)*n+(j+1)] * 0.00223214285714
+                          +in[(i-4)*n+(j+2)] * -0.00223214285714
+                          +in[(i-3)*n+(j+2)] * -0.00416666666667
+                          +in[(i-1)*n+(j+2)] * 0.0104166666667
+                          +in[(i)*n+(j+2)] * 0.0104166666667
+                          +in[(i+1)*n+(j+2)] * 0.0104166666667
+                          +in[(i+2)*n+(j+2)] * 0.03125
+                          +in[(i+3)*n+(j+2)] * 0.00416666666667
+                          +in[(i+4)*n+(j+2)] * 0.00223214285714
+                          +in[(i-4)*n+(j+3)] * -0.00223214285714
+                          +in[(i-2)*n+(j+3)] * 0.00416666666667
+                          +in[(i-1)*n+(j+3)] * 0.00416666666667
+                          +in[(i)*n+(j+3)] * 0.00416666666667
+                          +in[(i+1)*n+(j+3)] * 0.00416666666667
+                          +in[(i+2)*n+(j+3)] * 0.00416666666667
+                          +in[(i+3)*n+(j+3)] * 0.0208333333333
+                          +in[(i+4)*n+(j+3)] * 0.00223214285714
+                          +in[(i-3)*n+(j+4)] * 0.00223214285714
+                          +in[(i-2)*n+(j+4)] * 0.00223214285714
+                          +in[(i-1)*n+(j+4)] * 0.00223214285714
+                          +in[(i)*n+(j+4)] * 0.00223214285714
+                          +in[(i+1)*n+(j+4)] * 0.00223214285714
+                          +in[(i+2)*n+(j+4)] * 0.00223214285714
+                          +in[(i+3)*n+(j+4)] * 0.00223214285714
+                          +in[(i+4)*n+(j+4)] * 0.015625
+                          ;
+           }
+         }
+       }
+     }
+}
+
+void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
+    for (auto it=5; it<n-5; it+=t) {
+      for (auto jt=5; jt<n-5; jt+=t) {
+        for (auto i=it; i<std::min(n-5,it+t); ++i) {
+          PRAGMA_SIMD
+          for (auto j=jt; j<std::min(n-5,jt+t); ++j) {
+            out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
+                          +in[(i-4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-3)*n+(j-5)] * -0.00111111111111
+                          +in[(i-2)*n+(j-5)] * -0.00111111111111
+                          +in[(i-1)*n+(j-5)] * -0.00111111111111
+                          +in[(i)*n+(j-5)] * -0.00111111111111
+                          +in[(i+1)*n+(j-5)] * -0.00111111111111
+                          +in[(i+2)*n+(j-5)] * -0.00111111111111
+                          +in[(i+3)*n+(j-5)] * -0.00111111111111
+                          +in[(i+4)*n+(j-5)] * -0.00111111111111
+                          +in[(i-5)*n+(j-4)] * -0.00111111111111
+                          +in[(i-4)*n+(j-4)] * -0.0125
+                          +in[(i-3)*n+(j-4)] * -0.00178571428571
+                          +in[(i-2)*n+(j-4)] * -0.00178571428571
+                          +in[(i-1)*n+(j-4)] * -0.00178571428571
+                          +in[(i)*n+(j-4)] * -0.00178571428571
+                          +in[(i+1)*n+(j-4)] * -0.00178571428571
+                          +in[(i+2)*n+(j-4)] * -0.00178571428571
+                          +in[(i+3)*n+(j-4)] * -0.00178571428571
+                          +in[(i+5)*n+(j-4)] * 0.00111111111111
+                          +in[(i-5)*n+(j-3)] * -0.00111111111111
+                          +in[(i-4)*n+(j-3)] * -0.00178571428571
+                          +in[(i-3)*n+(j-3)] * -0.0166666666667
+                          +in[(i-2)*n+(j-3)] * -0.00333333333333
+                          +in[(i-1)*n+(j-3)] * -0.00333333333333
+                          +in[(i)*n+(j-3)] * -0.00333333333333
+                          +in[(i+1)*n+(j-3)] * -0.00333333333333
+                          +in[(i+2)*n+(j-3)] * -0.00333333333333
+                          +in[(i+4)*n+(j-3)] * 0.00178571428571
+                          +in[(i+5)*n+(j-3)] * 0.00111111111111
+                          +in[(i-5)*n+(j-2)] * -0.00111111111111
+                          +in[(i-4)*n+(j-2)] * -0.00178571428571
+                          +in[(i-3)*n+(j-2)] * -0.00333333333333
+                          +in[(i-2)*n+(j-2)] * -0.025
+                          +in[(i-1)*n+(j-2)] * -0.00833333333333
+                          +in[(i)*n+(j-2)] * -0.00833333333333
+                          +in[(i+1)*n+(j-2)] * -0.00833333333333
+                          +in[(i+3)*n+(j-2)] * 0.00333333333333
+                          +in[(i+4)*n+(j-2)] * 0.00178571428571
+                          +in[(i+5)*n+(j-2)] * 0.00111111111111
+                          +in[(i-5)*n+(j-1)] * -0.00111111111111
+                          +in[(i-4)*n+(j-1)] * -0.00178571428571
+                          +in[(i-3)*n+(j-1)] * -0.00333333333333
+                          +in[(i-2)*n+(j-1)] * -0.00833333333333
+                          +in[(i-1)*n+(j-1)] * -0.05
+                          +in[(i)*n+(j-1)] * -0.05
+                          +in[(i+2)*n+(j-1)] * 0.00833333333333
+                          +in[(i+3)*n+(j-1)] * 0.00333333333333
+                          +in[(i+4)*n+(j-1)] * 0.00178571428571
+                          +in[(i+5)*n+(j-1)] * 0.00111111111111
+                          +in[(i-5)*n+(j)] * -0.00111111111111
+                          +in[(i-4)*n+(j)] * -0.00178571428571
+                          +in[(i-3)*n+(j)] * -0.00333333333333
+                          +in[(i-2)*n+(j)] * -0.00833333333333
+                          +in[(i-1)*n+(j)] * -0.05
+                          +in[(i+1)*n+(j)] * 0.05
+                          +in[(i+2)*n+(j)] * 0.00833333333333
+                          +in[(i+3)*n+(j)] * 0.00333333333333
+                          +in[(i+4)*n+(j)] * 0.00178571428571
+                          +in[(i+5)*n+(j)] * 0.00111111111111
+                          +in[(i-5)*n+(j+1)] * -0.00111111111111
+                          +in[(i-4)*n+(j+1)] * -0.00178571428571
+                          +in[(i-3)*n+(j+1)] * -0.00333333333333
+                          +in[(i-2)*n+(j+1)] * -0.00833333333333
+                          +in[(i)*n+(j+1)] * 0.05
+                          +in[(i+1)*n+(j+1)] * 0.05
+                          +in[(i+2)*n+(j+1)] * 0.00833333333333
+                          +in[(i+3)*n+(j+1)] * 0.00333333333333
+                          +in[(i+4)*n+(j+1)] * 0.00178571428571
+                          +in[(i+5)*n+(j+1)] * 0.00111111111111
+                          +in[(i-5)*n+(j+2)] * -0.00111111111111
+                          +in[(i-4)*n+(j+2)] * -0.00178571428571
+                          +in[(i-3)*n+(j+2)] * -0.00333333333333
+                          +in[(i-1)*n+(j+2)] * 0.00833333333333
+                          +in[(i)*n+(j+2)] * 0.00833333333333
+                          +in[(i+1)*n+(j+2)] * 0.00833333333333
+                          +in[(i+2)*n+(j+2)] * 0.025
+                          +in[(i+3)*n+(j+2)] * 0.00333333333333
+                          +in[(i+4)*n+(j+2)] * 0.00178571428571
+                          +in[(i+5)*n+(j+2)] * 0.00111111111111
+                          +in[(i-5)*n+(j+3)] * -0.00111111111111
+                          +in[(i-4)*n+(j+3)] * -0.00178571428571
+                          +in[(i-2)*n+(j+3)] * 0.00333333333333
+                          +in[(i-1)*n+(j+3)] * 0.00333333333333
+                          +in[(i)*n+(j+3)] * 0.00333333333333
+                          +in[(i+1)*n+(j+3)] * 0.00333333333333
+                          +in[(i+2)*n+(j+3)] * 0.00333333333333
+                          +in[(i+3)*n+(j+3)] * 0.0166666666667
+                          +in[(i+4)*n+(j+3)] * 0.00178571428571
+                          +in[(i+5)*n+(j+3)] * 0.00111111111111
+                          +in[(i-5)*n+(j+4)] * -0.00111111111111
+                          +in[(i-3)*n+(j+4)] * 0.00178571428571
+                          +in[(i-2)*n+(j+4)] * 0.00178571428571
+                          +in[(i-1)*n+(j+4)] * 0.00178571428571
+                          +in[(i)*n+(j+4)] * 0.00178571428571
+                          +in[(i+1)*n+(j+4)] * 0.00178571428571
+                          +in[(i+2)*n+(j+4)] * 0.00178571428571
+                          +in[(i+3)*n+(j+4)] * 0.00178571428571
+                          +in[(i+4)*n+(j+4)] * 0.0125
+                          +in[(i+5)*n+(j+4)] * 0.00111111111111
+                          +in[(i-4)*n+(j+5)] * 0.00111111111111
+                          +in[(i-3)*n+(j+5)] * 0.00111111111111
+                          +in[(i-2)*n+(j+5)] * 0.00111111111111
+                          +in[(i-1)*n+(j+5)] * 0.00111111111111
+                          +in[(i)*n+(j+5)] * 0.00111111111111
+                          +in[(i+1)*n+(j+5)] * 0.00111111111111
+                          +in[(i+2)*n+(j+5)] * 0.00111111111111
+                          +in[(i+3)*n+(j+5)] * 0.00111111111111
+                          +in[(i+4)*n+(j+5)] * 0.00111111111111
+                          +in[(i+5)*n+(j+5)] * 0.01
+                          ;
+           }
+         }
+       }
+     }
+}
+

From 73179d5065286f0d210c580d509f5288488d74fe Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 11:40:20 -0700
Subject: [PATCH 157/245] clean new targets (prk::vector sequential)

---
 Cxx11/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 3953c4d80..508ee8e47 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -222,6 +222,7 @@ clean:
 	-rm -f *.optrpt
 	-rm -f *.dwarf
 	-rm -rf *.dSYM # Mac
+	-rm -f nstream transpose stencil p2p sparse dgemm
 	-rm -f *-vector
 	-rm -f *-valarray
 	-rm -f *-openmp

From 1636d95a7181d74855041513a026e0f84afb08e1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 11:40:35 -0700
Subject: [PATCH 158/245] silence GCC warning

---
 Cxx11/nstream-vector-taskloop.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc
index 3f4c8f1d6..3f5385a37 100644
--- a/Cxx11/nstream-vector-taskloop.cc
+++ b/Cxx11/nstream-vector-taskloop.cc
@@ -73,8 +73,8 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations, gs, offset;
-  size_t length;
+  int iterations;
+  size_t length, gs, offset;
   try {
       if (argc < 3) {
         throw "Usage: <# iterations> <vector length>";

From 801a315359000c8b404a680d5ed189099708fcb5 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 11:40:53 -0700
Subject: [PATCH 159/245] silence GCC warning

---
 Cxx11/sparse-vector.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/sparse-vector.cc b/Cxx11/sparse-vector.cc
index c521528e8..93f80b863 100644
--- a/Cxx11/sparse-vector.cc
+++ b/Cxx11/sparse-vector.cc
@@ -124,7 +124,7 @@ int main(int argc, char* argv[])
       if (lsize < 1) {
         throw "ERROR: grid dimension must be positive";
       }
-      size_t lsize2 = 2*lsize;
+      //size_t lsize2 = 2*lsize;
       size = 1L<<lsize;
       size2 = size*size;
 

From 722a7b5737e849c71a0bd07a39391ea1f1468d62 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:11:14 -0700
Subject: [PATCH 160/245] add new impls

---
 Cxx11/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 508ee8e47..040fc6c7f 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -66,7 +66,7 @@ else
   EXTRA += target
 endif
 
-all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA)
+all: sequential vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA)
 
 #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
@@ -87,6 +87,8 @@ all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy
 
 #dgemm: dgemm-vector dgemm-cblas dgemm-cublas
 
+sequential: p2p stencil transpose nstream dgemm sparse
+
 vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \
 	transpose-vector-async transpose-vector-thread
 

From 7ffd12c65d012c216ff922894c3ab3fa3024293b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:11:25 -0700
Subject: [PATCH 161/245] reorder loops

---
 Cxx11/dgemm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/dgemm.cc b/Cxx11/dgemm.cc
index 5d7fa7897..8b1560e55 100644
--- a/Cxx11/dgemm.cc
+++ b/Cxx11/dgemm.cc
@@ -83,8 +83,8 @@ void prk_dgemm(const int order, const int tile_size,
                      prk::vector<double> & C)
 {
     for (auto it=0; it<order; it+=tile_size) {
-      for (auto kt=0; kt<order; kt+=tile_size) {
-        for (auto jt=0; jt<order; jt+=tile_size) {
+      for (auto jt=0; jt<order; jt+=tile_size) {
+        for (auto kt=0; kt<order; kt+=tile_size) {
           // ICC will not hoist these on its own...
           auto iend = std::min(order,it+tile_size);
           auto jend = std::min(order,jt+tile_size);

From 30c9616390ee44f071de12a26dc7a5e8d1bdec78 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:11:49 -0700
Subject: [PATCH 162/245] silent compiler warning

---
 Cxx11/sparse-vector.cc | 3 ++-
 Cxx11/sparse.cc        | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Cxx11/sparse-vector.cc b/Cxx11/sparse-vector.cc
index 93f80b863..9b96ae8b8 100644
--- a/Cxx11/sparse-vector.cc
+++ b/Cxx11/sparse-vector.cc
@@ -105,7 +105,8 @@ int main(int argc, char* argv[])
   // Process and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations, lsize, radius, stencil_size;
+  int iterations, lsize;
+  unsigned radius, stencil_size;
   size_t size, size2, nent;
   double sparsity;
   try {
diff --git a/Cxx11/sparse.cc b/Cxx11/sparse.cc
index 38fb68deb..79c76de72 100644
--- a/Cxx11/sparse.cc
+++ b/Cxx11/sparse.cc
@@ -105,7 +105,8 @@ int main(int argc, char* argv[])
   // Process and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations, lsize, radius, stencil_size;
+  int iterations, lsize;
+  unsigned radius, stencil_size;
   size_t size, size2, nent;
   double sparsity;
   try {

From 892d24711f03ff4aa940ec48b5dcfdf56dc08f19 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:16:28 -0700
Subject: [PATCH 163/245] prk::vector impl seems to be working

---
 Cxx11/prk_util.h | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 3ad580f55..6638abb87 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -84,14 +84,14 @@
 
 namespace prk {
 
-    int get_alignment(void)
+    const int get_alignment(void)
     {
         /* a := alignment */
 #ifdef PRK_ALIGNMENT
         int a = PRK_ALIGNMENT;
 #else
-        char* temp = getenv("PRK_ALIGNMENT");
-        int a = (temp!=NULL) ? atoi(temp) : 64;
+        const char* temp = std::getenv("PRK_ALIGNMENT");
+        int a = (temp!=nullptr) ? std::atoi(temp) : 64;
         if (a < 8) a = 8;
         assert( (a & (~a+1)) == a ); /* is power of 2? */
 #endif
@@ -101,7 +101,7 @@ namespace prk {
 #if defined(__INTEL_COMPILER)
 
     template <typename T>
-    T * malloc(size_t n)
+    T * malloc<T>(size_t n)
     {
         const int alignment = prk::get_alignment();
         const size_t bytes = n * sizeof(T);
@@ -109,24 +109,26 @@ namespace prk {
     }
 
     template <typename T>
-    void free(T * p)
+    void free<T>(T * p)
     {
         _mm_free(p);
+        p = nullptr;
     }
 
 #else // !__INTEL_COMPILER
 
     template <typename T>
-    void * malloc(size_t bytes)
+    T * malloc(size_t n)
     {
-        const int alignment = prk_get_alignment();
+        const int alignment = prk::get_alignment();
+        const size_t bytes = n * sizeof(T);
 
         // We cannot use C11 aligned_alloc on Mac.
         // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69680 */
         // GCC claims to be C11 without knowing if glibc is compliant...
 #if !defined(__GNUC__) && \
     !defined(__APPLE__) && \
-     defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+     defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && 0 \
 
         // From ISO C11:
         //
@@ -140,16 +142,16 @@ namespace prk {
         //  Thus, if we do not round up the bytes to be a multiple
         //  of the alignment, we violate ISO C.
 
-        size_t padded = bytes;
-        size_t excess = bytes % alignment;
+        const size_t padded = bytes;
+        const size_t excess = bytes % alignment;
         if (excess>0) padded += (alignment - excess);
         return aligned_alloc(alignment,padded);
 
 #else
 
-        T * ptr = NULL;
-        int ret = posix_memalign(&ptr,alignment,bytes);
-        if (ret!=0) ptr = NULL;
+        T * ptr = nullptr;
+        const int ret = posix_memalign((void**)&ptr,alignment,bytes);
+        if (ret!=0) ptr = nullptr;
         return ptr;
 
 #endif
@@ -157,9 +159,10 @@ namespace prk {
     }
 
     template <typename T>
-    void free(void * p)
+    void free(T * p)
     {
-        free(p);
+        std::free(p);
+        p = nullptr;
     }
 
 #endif // __INTEL_COMPILER
@@ -192,17 +195,23 @@ namespace prk {
             vector(size_t n) {
                 //this->data_ = new T[n];
                 this->data_ = prk::malloc<T>(n);
+                this->size_ = n;
             }
 
             vector(size_t n, T v) {
                 //this->data_ = new T[n];
                 this->data_ = prk::malloc<T>(n);
                 for (size_t i=0; i<n; ++i) this->data_[i] = v;
+                this->size_ = n;
             }
 
             ~vector() {
                 //delete[] this->data_;
-                prk::free(this->data_);
+                prk::free<T>(this->data_);
+            }
+
+            void operator~() {
+                this->~vector();
             }
 
             T * data() {

From 61c1598a0ea23771638bd9cd4eface34a1f287e9 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:16:46 -0700
Subject: [PATCH 164/245] silence compiler warning

---
 Cxx11/sparse.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/sparse.cc b/Cxx11/sparse.cc
index 79c76de72..f9baa63a5 100644
--- a/Cxx11/sparse.cc
+++ b/Cxx11/sparse.cc
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
       if (lsize < 1) {
         throw "ERROR: grid dimension must be positive";
       }
-      size_t lsize2 = 2*lsize;
+      //size_t lsize2 = 2*lsize;
       size = 1L<<lsize;
       size2 = size*size;
 

From 8661ea432f6e7a52e9f08fc45a419a000dddee10 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:21:29 -0700
Subject: [PATCH 165/245] clean example for Intel toolchain

---
 common/make.defs.intel | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/common/make.defs.intel b/common/make.defs.intel
index 664d79e0c..cab461c08 100644
--- a/common/make.defs.intel
+++ b/common/make.defs.intel
@@ -14,7 +14,7 @@ CC=icc -std=c11 -pthread
 # All of the Fortran code is written for the 2008 standard and requires preprocessing.
 FC=ifort -std08 -fpp
 # C++11 may not be required but does no harm here.
-CXX=icpc -std=c++14 -pthread
+CXX=icpc -std=c++17 -pthread
 #
 # Compiler flags
 #
@@ -24,7 +24,7 @@ DEFAULT_OPT_FLAGS=-g -O3 -xHOST
 # If you are compiling for KNL on a Xeon login node, use the following:
 # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512
 #
-DEFAULT_OPT_FLAGS+=-qopt-report=5
+#DEFAULT_OPT_FLAGS+=-qopt-report=5
 #
 # OpenMP flags
 #
@@ -36,6 +36,9 @@ OFFLOADFLAG=-qopenmp-offload=host
 #
 # MacOS
 #OPENCLFLAG=-framework OpenCL
+# POCL
+# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct...
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL
 # Linux
 OPENCLDIR=/etc/alternatives/opencl-intel-tools
 OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
@@ -51,7 +54,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
 #SYCLCXX=${CXX} ${OPENMPFLAG}
-#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 #
 # OCCA
 #
@@ -89,8 +92,9 @@ CBLASFLAG=-DMKL -mkl
 # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
 #NVCC=/opt/llvm/cocl/bin/cocl
 # Linux w/ NVIDIA CUDA
-NVCC=nvcc -arch=sm_50
+NVCC=nvcc
 CUDAFLAGS=-g -O3 -std=c++11
+CUDAFLAGS+=-arch=sm_50
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
 #
@@ -115,6 +119,8 @@ MPICC=mpiicc -std=c99
 COARRAYFLAG=-coarray
 # multi-node
 # COARRAYFLAG=-coarray=distributed
-
+#
+# MEMKIND (used in C1z)
+#
 MEMKINDDIR=/home/parallels/PRK/deps
 MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib

From 7b170b11c0b23d70f6fad8f952030afded86b5dc Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 22 Apr 2019 12:29:46 -0700
Subject: [PATCH 166/245] use .data() instead of &([0]) and dynamic schedule
 loop in DGEMM CBLAS

---
 Cxx11/dgemm-cblas.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc
index 24ae52bae..b17b84785 100644
--- a/Cxx11/dgemm-cblas.cc
+++ b/Cxx11/dgemm-cblas.cc
@@ -104,7 +104,7 @@ void prk_dgemm(const int order,
     const double beta  = 1.0;
 
     cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-                n, n, n, alpha, &(A[0]), n, &(B[0]), n, beta, &(C[0]), n);
+                n, n, n, alpha, A.data(), n, B.data(), n, beta, C.data(), n);
 }
 
 void prk_dgemm(const int order, const int batches,
@@ -132,11 +132,11 @@ void prk_dgemm(const int order, const int batches, const int nt,
     const double beta  = 1.0;
 
 #ifdef _OPENMP
-#pragma omp parallel for schedule(static) num_threads(nt)
+#pragma omp parallel for schedule(dynamic) num_threads(nt)
 #endif
     for (int b=0; b<batches; ++b) {
         cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-                    n, n, n, alpha, &(A[b][0]), n, &(B[b][0]), n, beta, &(C[b][0]), n);
+                    n, n, n, alpha, A[b].data(), n, B[b].data(), n, beta, C[b].data(), n);
     }
 }
 
@@ -242,8 +242,7 @@ int main(int argc, char * argv[])
 #endif
   } else if (batches < 0) {
       if (batch_threads > 1) {
-          std::cout << "Batch size           = " << std::abs(batches) << " (loop over legacy BLAS with "
-                    << batch_threads << " threads)" << std::endl;
+          std::cout << "Batch size           = " << std::abs(batches) << " (loop over legacy BLAS with " << batch_threads << " threads)" << std::endl;
       } else {
           std::cout << "Batch size           = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl;
       }

From 036fcd57cc2c4df5cb65364f3e474f18fd22aedb Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 28 Apr 2019 10:24:37 -0700
Subject: [PATCH 167/245] Update make.defs.llvm

default to CodePlay
disable OCCA
---
 common/make.defs.llvm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index fda35f476..224edb8d9 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -75,9 +75,9 @@ SYCLFLAG+=-std=c++14
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
-SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
-SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
+#SYCLDIR=./triSYCL
+#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
+#SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -86,7 +86,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
 #
 # OCCA
 #
-OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
 #
 # TBB
 #

From e1efa89ef35675a3ed8d68bab2b44563ba701dd2 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 26 Apr 2019 16:02:49 -0700
Subject: [PATCH 168/245] add kokkos::fence where appropriate

---
 Cxx11/nstream-kokkos.cc   |  4 +++-
 Cxx11/stencil-kokkos.cc   | 31 ++++++++++++++++----------
 Cxx11/transpose-kokkos.cc | 46 ++++++++++++++++++++++-----------------
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc
index 9e0af56bd..be425e75b 100644
--- a/Cxx11/nstream-kokkos.cc
+++ b/Cxx11/nstream-kokkos.cc
@@ -139,6 +139,7 @@ int main(int argc, char * argv[])
           C[i] = 2.0;
       });
       Kokkos::fence();
+
       for (int iter = 0; iter<=iterations; ++iter) {
 
         if (iter==1) {
@@ -169,8 +170,9 @@ int main(int argc, char * argv[])
 
     double asum(0);
     Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) {
-      inner += std::fabs(A(i));
+        inner += std::fabs(A(i));
     }, asum);
+    Kokkos::fence();
 
     double epsilon(1.e-8);
     if (std::fabs(ar-asum)/asum > epsilon) {
diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc
index b92bd4a57..f5c3365ba 100644
--- a/Cxx11/stencil-kokkos.cc
+++ b/Cxx11/stencil-kokkos.cc
@@ -180,24 +180,30 @@ int main(int argc, char* argv[])
     auto tile2  = {tile_size,tile_size};
     auto full   = Kokkos::MDRangePolicy<Kokkos::Rank<2>>(z2,n2,tile2);
 
-    Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) {
-        in(i,j)  = static_cast<double>(i+j);
-        out(i,j) = 0.0;
-    });
+    {
+      Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) {
+          in(i,j)  = static_cast<double>(i+j);
+          out(i,j) = 0.0;
+      });
+      Kokkos::fence();
 
-    for (int iter = 0; iter<=iterations; ++iter) {
+      for (int iter = 0; iter<=iterations; ++iter) {
 
-      if (iter==1) stencil_time = prk::wtime();
+        if (iter==1) {
+          Kokkos::fence();
+          stencil_time = prk::wtime();
+        }
 
-      stencil(n, tile_size, in, out);
+        stencil(n, tile_size, in, out);
 
-      Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) {
-          in(i,j) += 1.0;
-      });
+        Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) {
+            in(i,j) += 1.0;
+        });
+      }
+      Kokkos::fence();
+      stencil_time = prk::wtime() - stencil_time;
     }
 
-    stencil_time = prk::wtime() - stencil_time;
-
     //////////////////////////////////////////////////////////////////////
     // Analyze and output results.
     //////////////////////////////////////////////////////////////////////
@@ -211,6 +217,7 @@ int main(int argc, char* argv[])
     Kokkos::parallel_reduce(inside, KOKKOS_LAMBDA(int i, int j, double & norm) {
         norm += std::fabs(out(i,j));
     }, norm);
+    Kokkos::fence();
     norm /= active_points;
 
     // verify correctness
diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc
index fa35ebb6e..9b5a4f6c0 100644
--- a/Cxx11/transpose-kokkos.cc
+++ b/Cxx11/transpose-kokkos.cc
@@ -117,6 +117,8 @@ int main(int argc, char * argv[])
     // Allocate space and perform the computation
     //////////////////////////////////////////////////////////////////////
 
+    double trans_time(0);
+
     matrix A("A", order, order);
     matrix B("B", order, order);
 
@@ -129,32 +131,36 @@ int main(int argc, char * argv[])
     auto policy_lr = Kokkos::MDRangePolicy<rl>({0,0},order2,tile2);
     auto policy_rl = Kokkos::MDRangePolicy<lr>({0,0},order2,tile2);
 
-    Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i, int j) {
-        A(i,j) = static_cast<double>(i*order+j);
-        B(i,j) = 0.0;
-    });
-
-    double trans_time(0);
+    {
+      Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i, int j) {
+          A(i,j) = static_cast<double>(i*order+j);
+          B(i,j) = 0.0;
+      });
+      Kokkos::fence();
 
-    for (int iter = 0; iter<=iterations; ++iter) {
+      for (int iter = 0; iter<=iterations; ++iter) {
 
-      if (iter==1) trans_time = prk::wtime();
+        if (iter==1) {
+          Kokkos::fence();
+          trans_time = prk::wtime();
+        }
 
-      if (permute) {
-          Kokkos::parallel_for(policy_rl, KOKKOS_LAMBDA(int i, int j) {
-              B(i,j) += A(j,i);
-              A(j,i) += 1.0;
-          });
-      } else {
-          Kokkos::parallel_for(policy_lr, KOKKOS_LAMBDA(int i, int j) {
-              B(i,j) += A(j,i);
-              A(j,i) += 1.0;
-          });
+        if (permute) {
+            Kokkos::parallel_for(policy_rl, KOKKOS_LAMBDA(int i, int j) {
+                B(i,j) += A(j,i);
+                A(j,i) += 1.0;
+            });
+        } else {
+            Kokkos::parallel_for(policy_lr, KOKKOS_LAMBDA(int i, int j) {
+                B(i,j) += A(j,i);
+                A(j,i) += 1.0;
+            });
+        }
       }
+      Kokkos::fence();
+      trans_time = prk::wtime() - trans_time;
     }
 
-    trans_time = prk::wtime() - trans_time;
-
     //////////////////////////////////////////////////////////////////////
     /// Analyze and output results
     //////////////////////////////////////////////////////////////////////

From 576b332e8226a9f3f7219f520ffdb0e88fc212e4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 24 Apr 2019 10:09:16 -0500
Subject: [PATCH 169/245] Flang is mostly Fortran 2008 complete now

---
 .gitignore                        |  3 +++
 FORTRAN/Makefile                  |  1 +
 FORTRAN/dgemm-openmp-target.f90   |  1 -
 FORTRAN/dgemm-pretty.f90          |  7 +------
 FORTRAN/dgemm-taskloop-openmp.f90 |  1 -
 FORTRAN/dgemm.f90                 | 11 -----------
 FORTRAN/nstream.f90               |  6 +++---
 FORTRAN/stencil-pretty.f90        |  8 --------
 FORTRAN/transpose-pretty.f90      | 20 +++++---------------
 FORTRAN/transpose.f90             | 17 -----------------
 10 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5a90844a4..a7e76eb32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,9 @@ octave-workspace                # Octave crashes
 */*__genmod.mod
 *.patch
 */*.patch
+*.dbg                           # Flang
+*/*.dbg
+*/*/*.dbg
 
 common/make.defs
 scripts/small/runfgmpi
diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index 551c8fc36..26226b648 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -90,6 +90,7 @@ dgemm-pretty: dgemm-pretty.f90
 clean:
 	-rm -f *.o
 	-rm -f *.i90
+	-rm -f *.dbg
 	-rm -f *__genmod.f90 # Intel Fortran
 	-rm -f *__genmod.mod # Intel Fortran
 	-rm -f *.optrpt
diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.f90
index ed2193bba..3c8ffbeec 100644
--- a/FORTRAN/dgemm-openmp-target.f90
+++ b/FORTRAN/dgemm-openmp-target.f90
@@ -181,7 +181,6 @@ program main
 
   forder = real(order,REAL64)
   reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1)
-  ! TODO: use intrinsic here (except PGI)
   checksum = 0.0d0
   !$omp parallel do simd reduction(+:checksum)
   do j=1,order
diff --git a/FORTRAN/dgemm-pretty.f90 b/FORTRAN/dgemm-pretty.f90
index e1e6ac7c2..59983f924 100644
--- a/FORTRAN/dgemm-pretty.f90
+++ b/FORTRAN/dgemm-pretty.f90
@@ -77,11 +77,7 @@ program main
   real(kind=REAL64), allocatable ::  C(:,:)         ! buffer to hold output matrix
   integer(kind=INT64) :: nflops
   ! runtime variables
-#if 1 || defined(PGI)
-  integer(kind=INT32) :: i
-#endif
-  integer(kind=INT64) :: j
-  integer(kind=INT32) :: k
+  integer(kind=INT32) :: i,j,k
   real(kind=REAL64) ::  checksum, reference, residuum
   real(kind=REAL64) ::  t0, t1, dgemm_time, avgtime ! timing parameters
   real(kind=REAL64), parameter ::  epsilon=1.D-8    ! error tolerance
@@ -166,7 +162,6 @@ program main
 
   forder = real(order,REAL64)
   reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1)
-  ! TODO: use intrinsic here (except PGI)
   checksum = 0.0d0
   do j=1,order
     do i=1,order
diff --git a/FORTRAN/dgemm-taskloop-openmp.f90 b/FORTRAN/dgemm-taskloop-openmp.f90
index b127dd356..2b64413d3 100644
--- a/FORTRAN/dgemm-taskloop-openmp.f90
+++ b/FORTRAN/dgemm-taskloop-openmp.f90
@@ -236,7 +236,6 @@ program main
 
   forder = real(order,REAL64)
   reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1)
-  ! TODO: use intrinsic here (except PGI)
   checksum = 0.0d0
   !$omp parallel do simd reduction(+:checksum)
   do j=1,order
diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.f90
index a68eff104..7123882a6 100644
--- a/FORTRAN/dgemm.f90
+++ b/FORTRAN/dgemm.f90
@@ -75,11 +75,6 @@ subroutine prk_dgemm(order, tile_size, A, B, C)
     do jt=1,order,tile_size
       do kt=1,order,tile_size
         do it=1,order,tile_size
-#elif defined(PGI)
-    ! PGI does not support DO CONCURRENT.
-    do jt=1,order,tile_size
-      do kt=1,order,tile_size
-        do it=1,order,tile_size
 #else
     do concurrent (jt=1:order:tile_size)
       do concurrent (kt=1:order:tile_size)
@@ -111,11 +106,6 @@ subroutine prk_dgemm(order, tile_size, A, B, C)
       do k=1,order
         !$omp simd
         do i=1,order
-#elif defined(PGI)
-    ! PGI does not support DO CONCURRENT.
-    do j=1,order
-      do k=1,order
-        do i=1,order
 #else
     do concurrent (j=1:order)
       do concurrent (k=1:order)
@@ -288,7 +278,6 @@ program main
 
   forder = real(order,REAL64)
   reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1)
-  ! TODO: use intrinsic here (except PGI)
   checksum = 0.0d0
   !$omp parallel do simd reduction(+:checksum)
   do j=1,order
diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90
index 6aa9c1529..dc4ee8744 100644
--- a/FORTRAN/nstream.f90
+++ b/FORTRAN/nstream.f90
@@ -192,7 +192,7 @@ program main
     C(i) = 2
   enddo
   !$omp end do
-#elif defined(PGI)
+#elif 0
   forall (i=1:length)
     A(i) = 0
     B(i) = 2
@@ -229,7 +229,7 @@ program main
       A(i) = A(i) + B(i) + scalar * C(i)
     enddo
     !$omp end do
-#elif defined(PGI)
+#elif 0
     forall (i=1:length)
       A(i) = A(i) + B(i) + scalar * C(i)
     end forall
@@ -267,7 +267,7 @@ program main
   ar = ar * length
 
   asum = 0
-#if defined(_OPENMP) || defined(PGI)
+#if defined(_OPENMP)
   !$omp parallel do reduction(+:asum)
   do i=1,length
     asum = asum + abs(A(i))
diff --git a/FORTRAN/stencil-pretty.f90 b/FORTRAN/stencil-pretty.f90
index e1ec242bf..1119ff731 100644
--- a/FORTRAN/stencil-pretty.f90
+++ b/FORTRAN/stencil-pretty.f90
@@ -285,17 +285,9 @@ program main
   call initialize_w(is_star,r,W)
 
   ! initialize the input and output arrays
-#if defined(PGI)
-  forall (i=1:n, j=1:n)
-#else
   do concurrent (i=1:n, j=1:n)
-#endif
     A(i,j) = cx*(i-1)+cy*(j-1)
-#if defined(PGI)
-  endforall
-#else
   enddo
-#endif
   !B(r+1:n-r,r+1:n-r) = 0 ! minimal
   B = 0 ! sufficient
 
diff --git a/FORTRAN/transpose-pretty.f90 b/FORTRAN/transpose-pretty.f90
index ee6676401..31c88b378 100644
--- a/FORTRAN/transpose-pretty.f90
+++ b/FORTRAN/transpose-pretty.f90
@@ -73,11 +73,8 @@ program main
   real(kind=REAL64), allocatable ::  B(:,:)         ! buffer to hold transposed matrix
   integer(kind=INT64) ::  bytes                     ! combined size of matrices
   ! runtime variables
-#if defined(PGI)
-  integer(kind=INT32) :: i
-#endif
-  integer(kind=INT32) :: k
-  integer(kind=INT64) :: j, o2                      ! for loop over order**2
+  integer(kind=INT32) :: i,j,k
+  integer(kind=INT64) :: j2, o2                      ! for loop over order**2
   real(kind=REAL64) ::  abserr                      ! squared error
   real(kind=REAL64) ::  t0, t1, trans_time, avgtime ! timing parameters
   real(kind=REAL64), parameter ::  epsilon=1.D-8    ! error tolerance
@@ -132,7 +129,7 @@ program main
 
   ! Fill the original matrix
   o2 = int(order,INT64)**2
-  A = reshape((/ (j, j = 0,o2) /),(/order, order/))
+  A = reshape((/ (j2, j2 = 0,o2) /),(/order, order/))
   B = 0
 
   t0 = 0
@@ -152,17 +149,10 @@ program main
   ! ********************************************************************
 
   ! we reuse A here as the reference matrix, to compute the error
-  A = ( transpose(reshape((/ (j, j = 0,o2) /),(/order, order/))) &
+  A = ( transpose(reshape((/ (j2, j2 = 0,o2) /),(/order, order/))) &
         * real(iterations+1,REAL64) ) &
       + real((iterations*(iterations+1))/2,REAL64)
-#if 0 && defined(PGI)
-  ! PGI generates a segfault here...
-  abserr = 0.0d0
-  forall (j=1:order,i=1:order)
-      abserr = abserr + (B(i,j) - A(i,j))**2
-  endforall
-  abserr = sqrt(abserr)
-#elif defined(PGI)
+#if defined(PGI)
   abserr = 0.0d0
   do j=1,order
     do i=1,order
diff --git a/FORTRAN/transpose.f90 b/FORTRAN/transpose.f90
index fdcbde105..d66d56715 100644
--- a/FORTRAN/transpose.f90
+++ b/FORTRAN/transpose.f90
@@ -172,10 +172,6 @@ program main
 #endif
     do jt=1,order,tile_size
       do it=1,order,tile_size
-#elif defined(PGI)
-    ! PGI does not support DO CONCURRENT.
-    do jt=1,order,tile_size
-      do it=1,order,tile_size
 #else
     do concurrent (jt=1:order:tile_size)
       do concurrent (it=1:order:tile_size)
@@ -196,11 +192,7 @@ program main
     !$omp do collapse(2)
     do j=1,order
       do i=1,order
-#elif defined(PGI)
-    do j=1,order
-      do i=1,order
 #else
-    ! PGI does not support DO CONCURRENT.
     do concurrent (j=1:order)
       do concurrent (i=1:order)
 #endif
@@ -242,9 +234,6 @@ program main
 #endif
       do jt=1,order,tile_size
         do it=1,order,tile_size
-#elif defined(PGI)
-      do jt=1,order,tile_size
-        do it=1,order,tile_size
 #else
       do concurrent (jt=1:order:tile_size)
         do concurrent (it=1:order:tile_size)
@@ -265,9 +254,6 @@ program main
       !$omp do collapse(2)
       do j=1,order
         do i=1,order
-#elif defined(PGI)
-      do j=1,order
-        do i=1,order
 #else
       do concurrent (j=1:order)
         do concurrent (i=1:order)
@@ -314,9 +300,6 @@ program main
   !$omp& reduction(+:abserr)
   do j=1,order
     do i=1,order
-#elif defined(PGI)
-  do j=1,order
-    do i=1,order
 #else
   do concurrent (j=1:order)
     do concurrent (i=1:order)

From 390d536fcb0dcc472b3230d67a21911e4c03abc6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 30 Apr 2019 19:02:24 -0700
Subject: [PATCH 170/245] pointless reordering of string

---
 FORTRAN/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index 26226b648..d96f87cce 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -24,7 +24,7 @@ ifeq ($(findstring ifort,$(FC)),ifort)
   ifeq ($(shell uname -s),Darwin)
     EXTRA = taskloop
   else
-    EXTRA = target coarray taskloop
+    EXTRA = taskloop target coarray
   endif
 endif
 # GCC (also matches pgfortran so PGI must come after)

From c37976c1959ce6c4d8baf5f43389b32ded1c2559 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 30 Apr 2019 19:02:49 -0700
Subject: [PATCH 171/245] add PGI support for IVDEP

---
 Cxx11/prk_simd.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Cxx11/prk_simd.h b/Cxx11/prk_simd.h
index 742bc4fcb..7daed0911 100644
--- a/Cxx11/prk_simd.h
+++ b/Cxx11/prk_simd.h
@@ -38,6 +38,8 @@
 # define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep)
 // According to https://github.com/LLNL/RAJA/pull/310, this improves lambda performance
 # define PRAGMA_INLINE PRAGMA(forceinline recursive)
+#elif defined(__PGI)
+# define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep)
 #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) )
 # define PRAGMA_SIMD PRAGMA(GCC ivdep)
 # define PRAGMA_INLINE PRAGMA(inline)

From f851bcc7ee43cd327beea4f8126c6b9ec562a821 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 30 Apr 2019 19:03:05 -0700
Subject: [PATCH 172/245] return value qualified is ignored

---
 Cxx11/prk_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 6638abb87..81ca5006f 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -84,7 +84,7 @@
 
 namespace prk {
 
-    const int get_alignment(void)
+    int get_alignment(void)
     {
         /* a := alignment */
 #ifdef PRK_ALIGNMENT

From 9ecc6594e2380df62a4db6ef3debb8b2d420a2a1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 30 Apr 2019 19:03:23 -0700
Subject: [PATCH 173/245] TBB does not support PGI

---
 Cxx11/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 040fc6c7f..e050c7abd 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -65,8 +65,12 @@ ifeq ($(shell uname -s),Darwin)
 else
   EXTRA += target
 endif
+ifneq ($(findstring pgc++,$(CXX)),pgc++)
+    @echo CXX=$(CXX)
+  EXTRA += tbb pstl
+endif
 
-all: sequential vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA)
+all: sequential vector valarray openmp taskloop stl rangefor kokkos opencl sycl boost-compute $(EXTRA) # raja
 
 #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \

From 0d811ddfe23562e0d0bcfd26598288e59d74a921 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 30 Apr 2019 19:05:02 -0700
Subject: [PATCH 174/245] update PGI example flags

---
 common/make.defs.pgi | 50 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/common/make.defs.pgi b/common/make.defs.pgi
index 1205afff2..78447462d 100644
--- a/common/make.defs.pgi
+++ b/common/make.defs.pgi
@@ -9,7 +9,7 @@ CC=pgcc -c11
 FC=pgfortran -Mpreprocess -Mfreeform
 #FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Wl,-rpath=/opt/llvm/pgi-flang/lib
 # C++11 may not be required but does no harm here.
-CXX=pgc++ --c++11
+CXX=pgc++ --c++17
 #
 # Compiler flags
 #
@@ -18,9 +18,10 @@ DEFAULT_OPT_FLAGS=-O2 -tp=haswell
 # OpenMP flags
 #
 OPENMPFLAG=-mp #-Minfo=mp,vect
-OFFLOADFLAG=-mp #-Minfo=mp,vect
-#ORNLACCFLAG=-acc -ta=multicore -Minfo=accel
-ORNLACCFLAG=-acc -ta=tesla:cc70 -Minfo=accel
+OPENMPSIMDFLAG=-mp #-Minfo=mp,vect
+OFFLOADFLAG=-mp -ta=multicore #-Minfo=mp,vect
+ORNLACCFLAG=-acc -ta=multicore #-Minfo=accel
+#ORNLACCFLAG=-acc -ta=tesla:cc70 -Minfo=accel
 ORNLACCFLAG+=-Mlarge_arrays
 #
 # OpenCL flags
@@ -31,18 +32,45 @@ ORNLACCFLAG+=-Mlarge_arrays
 OPENCLDIR=/etc/alternatives/opencl-intel-tools
 OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 #
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
+SYCLFLAG=-I$(SYCLDIR)/include
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#
+# OCCA
+#
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
+# TBB
+#
+TBBDIR=./tbb
+TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#
 # Parallel STL, Boost, etc.
 #
 BOOSTFLAG=-DUSE_BOOST -I.
-PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG}
-KOKKOSDIR=./kokkos
-KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
-RAJADIR=./raja
-RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+KOKKOSDIR=/opt/kokkos/pgi
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
+RAJADIR=/opt/raja/pgi
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+THRUSTDIR=/opt/nvidia/thrust
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
 #
-CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
+BLASFLAG=
+CBLASFLAG=
 #
 # CUDA flags
 #
@@ -78,4 +106,4 @@ CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
 # MPI
 #
 # Needs PATH and LD_LIBRARY_PATH set appropriately...
-MPICC=/opt/pgi/linux86-64/2017/mpi/openmpi/bin/mpicc
+MPICC=/opt/pgi/linux86-64/2019/mpi/openmpi/bin/mpicc

From d6f307191b0f2c99a92eefe9f3764c485ba0ae65 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 21 Mar 2019 20:29:54 -0700
Subject: [PATCH 175/245] work around Clang FE issue

---
 Cxx11/stencil-openmp-target.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Cxx11/stencil-openmp-target.cc b/Cxx11/stencil-openmp-target.cc
index 76a4fab3c..62c5b73b0 100644
--- a/Cxx11/stencil-openmp-target.cc
+++ b/Cxx11/stencil-openmp-target.cc
@@ -65,11 +65,8 @@
 
 void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out)
 {
-    std::cout << "You are trying to use a stencil that does not exist." << std::endl;
-    std::cout << "Please generate the new stencil using the code generator." << std::endl;
-    // n will never be zero - this is to silence compiler warnings.
-    if (n==0) std::cout << in << out << std::endl;
-    std::abort();
+    // use arguments to silence compiler warnings
+    out[0] = in[0] + n + t;
 }
 
 int main(int argc, char* argv[])

From a3eadc2f5bd7e15a3a4955e33e5787b343b23eea Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 1 May 2019 14:57:27 -0700
Subject: [PATCH 176/245] fix errors

---
 Cxx11/Makefile   | 1 -
 Cxx11/prk_util.h | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index e050c7abd..b166d65d4 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -66,7 +66,6 @@ else
   EXTRA += target
 endif
 ifneq ($(findstring pgc++,$(CXX)),pgc++)
-    @echo CXX=$(CXX)
   EXTRA += tbb pstl
 endif
 
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 81ca5006f..d2caae1b7 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -101,7 +101,7 @@ namespace prk {
 #if defined(__INTEL_COMPILER)
 
     template <typename T>
-    T * malloc<T>(size_t n)
+    T * malloc(size_t n)
     {
         const int alignment = prk::get_alignment();
         const size_t bytes = n * sizeof(T);
@@ -109,7 +109,7 @@ namespace prk {
     }
 
     template <typename T>
-    void free<T>(T * p)
+    void free(T * p)
     {
         _mm_free(p);
         p = nullptr;

From bd2dc5f1f21022eca5b3af670e5f0d06bdb3a68d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 16 Apr 2019 15:27:28 -0700
Subject: [PATCH 177/245] add hyperplane OpenMP to C1z

---
 C1z/Makefile                |   2 +-
 C1z/p2p-hyperplane-openmp.c | 201 ++++++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+), 1 deletion(-)
 create mode 100644 C1z/p2p-hyperplane-openmp.c

diff --git a/C1z/Makefile b/C1z/Makefile
index aac123acc..20619ccaf 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -49,7 +49,7 @@ serial: nstream p2p p2p-innerloop stencil transpose
 
 thread: transpose-thread
 
-openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp p2p-hyperplane-openmp
 
 mpi: nstream-mpi
 
diff --git a/C1z/p2p-hyperplane-openmp.c b/C1z/p2p-hyperplane-openmp.c
new file mode 100644
index 000000000..a24d73f89
--- /dev/null
+++ b/C1z/p2p-hyperplane-openmp.c
@@ -0,0 +1,201 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid. The first array dimension
+///          is distributed among the threads (stripwise decomposition).
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <n>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - C99-ification by Jeff Hammond, February 2016.
+///          - C11-ification by Jeff Hammond, June 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "p2p-kernel.h"
+
+int main(int argc, char* argv[])
+{
+  printf("Parallel Research Kernels version %.2f\n", PRKVERSION);
+#ifdef _OPENMP
+  printf("C11/OpenMP HYPERPLANE pipeline execution on 2D grid\n");
+#else
+  printf("C11/Serial HYPERPLANE pipeline execution on 2D grid\n");
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    printf("Usage: <# iterations> <array dimension> <chunk size>\n");
+    return 1;
+  }
+
+  // number of times to run the pipeline algorithm
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    printf("ERROR: iterations must be >= 1\n");
+    return 1;
+  }
+
+  // grid dimensions
+  int n = atoi(argv[2]);
+  if (n < 1) {
+    printf("ERROR: grid dimension must be positive: %d\n", n);
+    return 1;
+  }
+
+  // grid chunk dimensions
+  int nc = (argc > 3) ? atoi(argv[3]) : 1;
+  nc = MAX(1,nc);
+  nc = MIN(n,nc);
+
+  // number of grid blocks
+  int nb = (n-1)/nc;
+  if ((n-1)%nc) nb++;
+
+#ifdef _OPENMP
+  printf("Number of threads (max)   = %d\n", omp_get_max_threads());
+#endif
+  printf("Number of iterations      = %d\n", iterations);
+  printf("Grid sizes                = %d,%d\n", n, n);
+  printf("Grid chunk sizes, blocks  = %d,%d\n", nc, nb);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double pipeline_time = 0.0; // silence compiler warning
+
+  size_t bytes = n*n*sizeof(double);
+  double * restrict grid = prk_malloc(bytes);
+
+  OMP_PARALLEL()
+  {
+    OMP_FOR()
+    for (int i=0; i<n; i++) {
+      OMP_SIMD
+      for (int j=0; j<n; j++) {
+        grid[i*n+j] = 0.0;
+      }
+    }
+
+    // set boundary values (bottom and left side of grid)
+    OMP_MASTER
+    {
+      for (int j=0; j<n; j++) {
+        grid[0*n+j] = (double)j;
+      }
+      for (int i=0; i<n; i++) {
+        grid[i*n+0] = (double)i;
+      }
+    }
+    OMP_BARRIER
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) {
+          OMP_BARRIER
+          OMP_MASTER
+          pipeline_time = prk_wtime();
+      }
+
+      if (nc==1) {
+        for (int i=2; i<=2*n-2; i++) {
+          OMP_FOR_SIMD()
+          for (int j=MAX(2,i-n+2); j<=MIN(i,n); j++) {
+            const int x = i-j+1;
+            const int y = j-1;
+            grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+          }
+        }
+      } else {
+        for (int i=2; i<=2*(nb+1)-2; i++) {
+          OMP_FOR()
+          for (int j=MAX(2,i-(nb+1)+2); j<=MIN(i,nb+1); j++) {
+            const int ib = nc*(i-j+1-1)+1;
+            const int jb = nc*(j-1-1)+1;
+            sweep_tile(ib, MIN(n,ib+nc), jb, MIN(n,jb+nc), n, grid);
+          }
+        }
+      }
+      OMP_MASTER
+      grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+    }
+    OMP_BARRIER
+    OMP_MASTER
+    pipeline_time = prk_wtime() - pipeline_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.e-8;
+  const double corner_val = ((iterations+1.)*(n+n-2.));
+  if ( (fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
+    printf("ERROR: checksum %lf does not match verification value %lf\n", grid[(n-1)*n+(n-1)], corner_val);
+    return 1;
+  }
+
+  prk_free(grid);
+
+#ifdef VERBOSE
+  printf("Solution validates; verification value = %lf\n", corner_val );
+#else
+  printf("Solution validates\n" );
+#endif
+  double avgtime = pipeline_time/iterations;
+  printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 2.0e-6 * ( (n-1)*(n-1) )/avgtime, avgtime );
+
+  return 0;
+}

From 0f390e9cd21cfdd31d7be7208ff8bb424ebb66b0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 1 May 2019 15:10:51 -0700
Subject: [PATCH 178/245] add hyperplane to make and travis

---
 C1z/Makefile            | 3 +++
 travis/build-run-prk.sh | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/C1z/Makefile b/C1z/Makefile
index 20619ccaf..0bcd9a95f 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -68,6 +68,9 @@ ispc: transpose-ispc
 p2p-innerloop: p2p-innerloop-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
+p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h
+	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
+
 %-mpi: %-mpi.c prk_util.h
 	$(MPICC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 194e7ca51..6f9ced772 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -152,6 +152,8 @@ case "$PRK_TARGET" in
         $PRK_TARGET_PATH/p2p             10 1024 1024
         $PRK_TARGET_PATH/p2p             10 1024 1024 100 100
         $PRK_TARGET_PATH/p2p-innerloop   10 1024
+        $PRK_TARGET_PATH/p2p-hyperplane  10 1024
+        $PRK_TARGET_PATH/p2p-hyperplane  10 1024 32
         $PRK_TARGET_PATH/stencil         10 1000
         $PRK_TARGET_PATH/transpose       10 1024 32
         #echo "Test stencil code generator"
@@ -174,6 +176,8 @@ case "$PRK_TARGET" in
                 ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-innerloop-openmp     10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024 32
                 $PRK_TARGET_PATH/stencil-openmp           10 1000
                 $PRK_TARGET_PATH/transpose-openmp         10 1024 32
                 #echo "Test stencil code generator"

From 78247f4935d6de290ba846da478fa9332d2d084a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 2 May 2019 09:36:53 -0700
Subject: [PATCH 179/245] fix Fortran OpenMP p2p tasks

---
 FORTRAN/p2p-tasks-openmp.f90 | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/FORTRAN/p2p-tasks-openmp.f90 b/FORTRAN/p2p-tasks-openmp.f90
index 74c7dcd90..fcebe0c82 100644
--- a/FORTRAN/p2p-tasks-openmp.f90
+++ b/FORTRAN/p2p-tasks-openmp.f90
@@ -173,12 +173,17 @@ program main
     enddo
   enddo
 
-  do j=1,n
+  !$omp task private(j) firstprivate(n) shared(grid)
+  do j=2,n
     grid(1,j) = real(j-1,REAL64)
   enddo
-  do i=1,m
+  !$omp end task
+  !$omp task private(i) firstprivate(m) shared(grid)
+  do i=2,m
     grid(i,1) = real(i-1,REAL64)
   enddo
+  !$omp end task
+  !$omp taskwait
 
   do k=0,iterations
 

From 24972d2b4ee7a4842fb626c962f9126874be47b3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 6 May 2019 08:48:04 -0700
Subject: [PATCH 180/245] fix Travis and regular builds

---
 C1z/Makefile            | 2 +-
 travis/build-run-prk.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/C1z/Makefile b/C1z/Makefile
index 0bcd9a95f..535ed9eaa 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -45,7 +45,7 @@ endif
 
 all: serial thread openmp taskloop $(EXTRA)
 
-serial: nstream p2p p2p-innerloop stencil transpose
+serial: nstream p2p p2p-innerloop p2p-hyperplane stencil transpose
 
 thread: transpose-thread
 
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 6f9ced772..962ecc1f4 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -148,7 +148,7 @@ case "$PRK_TARGET" in
         echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs
 
         # C11 without external parallelism
-        ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop
+        ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop p2p-hyperplane
         $PRK_TARGET_PATH/p2p             10 1024 1024
         $PRK_TARGET_PATH/p2p             10 1024 1024 100 100
         $PRK_TARGET_PATH/p2p-innerloop   10 1024

From bcf4e4d29e366845f2a6dfc245fb21624bbe7fe7 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 30 Nov 2017 11:41:21 -0800
Subject: [PATCH 181/245] coefficients need to be double precision

---
 FORTRAN/generate-fortran-stencil.py |    2 +-
 FORTRAN/stencil_openmp.f90          | 2196 +++++++++++++-------------
 FORTRAN/stencil_pretty.f90          | 2160 +++++++++++++-------------
 FORTRAN/stencil_serial.f90          | 2160 +++++++++++++-------------
 FORTRAN/stencil_target.f90          | 2214 +++++++++++++--------------
 FORTRAN/stencil_taskloop.f90        | 2160 +++++++++++++-------------
 6 files changed, 5455 insertions(+), 5437 deletions(-)

diff --git a/FORTRAN/generate-fortran-stencil.py b/FORTRAN/generate-fortran-stencil.py
index 0f915163e..aabf96dcb 100755
--- a/FORTRAN/generate-fortran-stencil.py
+++ b/FORTRAN/generate-fortran-stencil.py
@@ -40,7 +40,7 @@ def codegen(src,pattern,stencil_size,radius,W,model):
             if i-radius>=0: opi='+'
             else: opi=''
             if ( W[j][i] != 0.0):
-                src.write('                 + in(i'+opi+str(i-radius)+',j'+opj+str(j-radius)+') * ('+str(W[j][i])+') &\n')
+                src.write('                 + in(i'+opi+str(i-radius)+',j'+opj+str(j-radius)+') * ('+str(W[j][i])+'d0) &\n')
     src.write('+0.0\n')
     src.write('      end do\n')
     if (model=='openmp' or model=='target' or model=='taskloop'):
diff --git a/FORTRAN/stencil_openmp.f90 b/FORTRAN/stencil_openmp.f90
index e5e0a1ce9..0b5ea36bd 100644
--- a/FORTRAN/stencil_openmp.f90
+++ b/FORTRAN/stencil_openmp.f90
@@ -8,12 +8,14 @@ subroutine star1(n, in, out)
     !$omp do
     do i=1,n-1-1
       !$omp simd
+      do j=1,n-1-1
+    do i=1,n-1-1
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-1) * (-0.5) &
-                 + in(i-1,j+0) * (-0.5) &
-                 + in(i+1,j+0) * (0.5) &
-                 + in(i+0,j+1) * (0.5) &
+                 + in(i+0,j-1) * (-0.5d0) &
+                 + in(i-1,j+0) * (-0.5d0) &
+                 + in(i+1,j+0) * (0.5d0) &
+                 + in(i+0,j+1) * (0.5d0) &
 +0.0
       end do
       !$omp end simd
@@ -31,16 +33,18 @@ subroutine star2(n, in, out)
     !$omp do
     do i=2,n-2-1
       !$omp simd
+      do j=2,n-2-1
+    do i=2,n-2-1
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-2) * (-0.125) &
-                 + in(i+0,j-1) * (-0.25) &
-                 + in(i-2,j+0) * (-0.125) &
-                 + in(i-1,j+0) * (-0.25) &
-                 + in(i+1,j+0) * (0.25) &
-                 + in(i+2,j+0) * (0.125) &
-                 + in(i+0,j+1) * (0.25) &
-                 + in(i+0,j+2) * (0.125) &
+                 + in(i+0,j-2) * (-0.125d0) &
+                 + in(i+0,j-1) * (-0.25d0) &
+                 + in(i-2,j+0) * (-0.125d0) &
+                 + in(i-1,j+0) * (-0.25d0) &
+                 + in(i+1,j+0) * (0.25d0) &
+                 + in(i+2,j+0) * (0.125d0) &
+                 + in(i+0,j+1) * (0.25d0) &
+                 + in(i+0,j+2) * (0.125d0) &
 +0.0
       end do
       !$omp end simd
@@ -58,20 +62,22 @@ subroutine star3(n, in, out)
     !$omp do
     do i=3,n-3-1
       !$omp simd
+      do j=3,n-3-1
+    do i=3,n-3-1
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-3) * (-0.05555555555555555) &
-                 + in(i+0,j-2) * (-0.08333333333333333) &
-                 + in(i+0,j-1) * (-0.16666666666666666) &
-                 + in(i-3,j+0) * (-0.05555555555555555) &
-                 + in(i-2,j+0) * (-0.08333333333333333) &
-                 + in(i-1,j+0) * (-0.16666666666666666) &
-                 + in(i+1,j+0) * (0.16666666666666666) &
-                 + in(i+2,j+0) * (0.08333333333333333) &
-                 + in(i+3,j+0) * (0.05555555555555555) &
-                 + in(i+0,j+1) * (0.16666666666666666) &
-                 + in(i+0,j+2) * (0.08333333333333333) &
-                 + in(i+0,j+3) * (0.05555555555555555) &
+                 + in(i+0,j-3) * (-0.05555555555555555d0) &
+                 + in(i+0,j-2) * (-0.08333333333333333d0) &
+                 + in(i+0,j-1) * (-0.16666666666666666d0) &
+                 + in(i-3,j+0) * (-0.05555555555555555d0) &
+                 + in(i-2,j+0) * (-0.08333333333333333d0) &
+                 + in(i-1,j+0) * (-0.16666666666666666d0) &
+                 + in(i+1,j+0) * (0.16666666666666666d0) &
+                 + in(i+2,j+0) * (0.08333333333333333d0) &
+                 + in(i+3,j+0) * (0.05555555555555555d0) &
+                 + in(i+0,j+1) * (0.16666666666666666d0) &
+                 + in(i+0,j+2) * (0.08333333333333333d0) &
+                 + in(i+0,j+3) * (0.05555555555555555d0) &
 +0.0
       end do
       !$omp end simd
@@ -89,24 +95,26 @@ subroutine star4(n, in, out)
     !$omp do
     do i=4,n-4-1
       !$omp simd
+      do j=4,n-4-1
+    do i=4,n-4-1
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-4) * (-0.03125) &
-                 + in(i+0,j-3) * (-0.041666666666666664) &
-                 + in(i+0,j-2) * (-0.0625) &
-                 + in(i+0,j-1) * (-0.125) &
-                 + in(i-4,j+0) * (-0.03125) &
-                 + in(i-3,j+0) * (-0.041666666666666664) &
-                 + in(i-2,j+0) * (-0.0625) &
-                 + in(i-1,j+0) * (-0.125) &
-                 + in(i+1,j+0) * (0.125) &
-                 + in(i+2,j+0) * (0.0625) &
-                 + in(i+3,j+0) * (0.041666666666666664) &
-                 + in(i+4,j+0) * (0.03125) &
-                 + in(i+0,j+1) * (0.125) &
-                 + in(i+0,j+2) * (0.0625) &
-                 + in(i+0,j+3) * (0.041666666666666664) &
-                 + in(i+0,j+4) * (0.03125) &
+                 + in(i+0,j-4) * (-0.03125d0) &
+                 + in(i+0,j-3) * (-0.041666666666666664d0) &
+                 + in(i+0,j-2) * (-0.0625d0) &
+                 + in(i+0,j-1) * (-0.125d0) &
+                 + in(i-4,j+0) * (-0.03125d0) &
+                 + in(i-3,j+0) * (-0.041666666666666664d0) &
+                 + in(i-2,j+0) * (-0.0625d0) &
+                 + in(i-1,j+0) * (-0.125d0) &
+                 + in(i+1,j+0) * (0.125d0) &
+                 + in(i+2,j+0) * (0.0625d0) &
+                 + in(i+3,j+0) * (0.041666666666666664d0) &
+                 + in(i+4,j+0) * (0.03125d0) &
+                 + in(i+0,j+1) * (0.125d0) &
+                 + in(i+0,j+2) * (0.0625d0) &
+                 + in(i+0,j+3) * (0.041666666666666664d0) &
+                 + in(i+0,j+4) * (0.03125d0) &
 +0.0
       end do
       !$omp end simd
@@ -124,28 +132,30 @@ subroutine star5(n, in, out)
     !$omp do
     do i=5,n-5-1
       !$omp simd
+      do j=5,n-5-1
+    do i=5,n-5-1
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-5) * (-0.02) &
-                 + in(i+0,j-4) * (-0.025) &
-                 + in(i+0,j-3) * (-0.03333333333333333) &
-                 + in(i+0,j-2) * (-0.05) &
-                 + in(i+0,j-1) * (-0.1) &
-                 + in(i-5,j+0) * (-0.02) &
-                 + in(i-4,j+0) * (-0.025) &
-                 + in(i-3,j+0) * (-0.03333333333333333) &
-                 + in(i-2,j+0) * (-0.05) &
-                 + in(i-1,j+0) * (-0.1) &
-                 + in(i+1,j+0) * (0.1) &
-                 + in(i+2,j+0) * (0.05) &
-                 + in(i+3,j+0) * (0.03333333333333333) &
-                 + in(i+4,j+0) * (0.025) &
-                 + in(i+5,j+0) * (0.02) &
-                 + in(i+0,j+1) * (0.1) &
-                 + in(i+0,j+2) * (0.05) &
-                 + in(i+0,j+3) * (0.03333333333333333) &
-                 + in(i+0,j+4) * (0.025) &
-                 + in(i+0,j+5) * (0.02) &
+                 + in(i+0,j-5) * (-0.02d0) &
+                 + in(i+0,j-4) * (-0.025d0) &
+                 + in(i+0,j-3) * (-0.03333333333333333d0) &
+                 + in(i+0,j-2) * (-0.05d0) &
+                 + in(i+0,j-1) * (-0.1d0) &
+                 + in(i-5,j+0) * (-0.02d0) &
+                 + in(i-4,j+0) * (-0.025d0) &
+                 + in(i-3,j+0) * (-0.03333333333333333d0) &
+                 + in(i-2,j+0) * (-0.05d0) &
+                 + in(i-1,j+0) * (-0.1d0) &
+                 + in(i+1,j+0) * (0.1d0) &
+                 + in(i+2,j+0) * (0.05d0) &
+                 + in(i+3,j+0) * (0.03333333333333333d0) &
+                 + in(i+4,j+0) * (0.025d0) &
+                 + in(i+5,j+0) * (0.02d0) &
+                 + in(i+0,j+1) * (0.1d0) &
+                 + in(i+0,j+2) * (0.05d0) &
+                 + in(i+0,j+3) * (0.03333333333333333d0) &
+                 + in(i+0,j+4) * (0.025d0) &
+                 + in(i+0,j+5) * (0.02d0) &
 +0.0
       end do
       !$omp end simd
@@ -163,32 +173,34 @@ subroutine star6(n, in, out)
     !$omp do
     do i=6,n-6-1
       !$omp simd
+      do j=6,n-6-1
+    do i=6,n-6-1
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-6) * (-0.013888888888888888) &
-                 + in(i+0,j-5) * (-0.016666666666666666) &
-                 + in(i+0,j-4) * (-0.020833333333333332) &
-                 + in(i+0,j-3) * (-0.027777777777777776) &
-                 + in(i+0,j-2) * (-0.041666666666666664) &
-                 + in(i+0,j-1) * (-0.08333333333333333) &
-                 + in(i-6,j+0) * (-0.013888888888888888) &
-                 + in(i-5,j+0) * (-0.016666666666666666) &
-                 + in(i-4,j+0) * (-0.020833333333333332) &
-                 + in(i-3,j+0) * (-0.027777777777777776) &
-                 + in(i-2,j+0) * (-0.041666666666666664) &
-                 + in(i-1,j+0) * (-0.08333333333333333) &
-                 + in(i+1,j+0) * (0.08333333333333333) &
-                 + in(i+2,j+0) * (0.041666666666666664) &
-                 + in(i+3,j+0) * (0.027777777777777776) &
-                 + in(i+4,j+0) * (0.020833333333333332) &
-                 + in(i+5,j+0) * (0.016666666666666666) &
-                 + in(i+6,j+0) * (0.013888888888888888) &
-                 + in(i+0,j+1) * (0.08333333333333333) &
-                 + in(i+0,j+2) * (0.041666666666666664) &
-                 + in(i+0,j+3) * (0.027777777777777776) &
-                 + in(i+0,j+4) * (0.020833333333333332) &
-                 + in(i+0,j+5) * (0.016666666666666666) &
-                 + in(i+0,j+6) * (0.013888888888888888) &
+                 + in(i+0,j-6) * (-0.013888888888888888d0) &
+                 + in(i+0,j-5) * (-0.016666666666666666d0) &
+                 + in(i+0,j-4) * (-0.020833333333333332d0) &
+                 + in(i+0,j-3) * (-0.027777777777777776d0) &
+                 + in(i+0,j-2) * (-0.041666666666666664d0) &
+                 + in(i+0,j-1) * (-0.08333333333333333d0) &
+                 + in(i-6,j+0) * (-0.013888888888888888d0) &
+                 + in(i-5,j+0) * (-0.016666666666666666d0) &
+                 + in(i-4,j+0) * (-0.020833333333333332d0) &
+                 + in(i-3,j+0) * (-0.027777777777777776d0) &
+                 + in(i-2,j+0) * (-0.041666666666666664d0) &
+                 + in(i-1,j+0) * (-0.08333333333333333d0) &
+                 + in(i+1,j+0) * (0.08333333333333333d0) &
+                 + in(i+2,j+0) * (0.041666666666666664d0) &
+                 + in(i+3,j+0) * (0.027777777777777776d0) &
+                 + in(i+4,j+0) * (0.020833333333333332d0) &
+                 + in(i+5,j+0) * (0.016666666666666666d0) &
+                 + in(i+6,j+0) * (0.013888888888888888d0) &
+                 + in(i+0,j+1) * (0.08333333333333333d0) &
+                 + in(i+0,j+2) * (0.041666666666666664d0) &
+                 + in(i+0,j+3) * (0.027777777777777776d0) &
+                 + in(i+0,j+4) * (0.020833333333333332d0) &
+                 + in(i+0,j+5) * (0.016666666666666666d0) &
+                 + in(i+0,j+6) * (0.013888888888888888d0) &
 +0.0
       end do
       !$omp end simd
@@ -206,36 +218,38 @@ subroutine star7(n, in, out)
     !$omp do
     do i=7,n-7-1
       !$omp simd
+      do j=7,n-7-1
+    do i=7,n-7-1
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-7) * (-0.01020408163265306) &
-                 + in(i+0,j-6) * (-0.011904761904761904) &
-                 + in(i+0,j-5) * (-0.014285714285714285) &
-                 + in(i+0,j-4) * (-0.017857142857142856) &
-                 + in(i+0,j-3) * (-0.023809523809523808) &
-                 + in(i+0,j-2) * (-0.03571428571428571) &
-                 + in(i+0,j-1) * (-0.07142857142857142) &
-                 + in(i-7,j+0) * (-0.01020408163265306) &
-                 + in(i-6,j+0) * (-0.011904761904761904) &
-                 + in(i-5,j+0) * (-0.014285714285714285) &
-                 + in(i-4,j+0) * (-0.017857142857142856) &
-                 + in(i-3,j+0) * (-0.023809523809523808) &
-                 + in(i-2,j+0) * (-0.03571428571428571) &
-                 + in(i-1,j+0) * (-0.07142857142857142) &
-                 + in(i+1,j+0) * (0.07142857142857142) &
-                 + in(i+2,j+0) * (0.03571428571428571) &
-                 + in(i+3,j+0) * (0.023809523809523808) &
-                 + in(i+4,j+0) * (0.017857142857142856) &
-                 + in(i+5,j+0) * (0.014285714285714285) &
-                 + in(i+6,j+0) * (0.011904761904761904) &
-                 + in(i+7,j+0) * (0.01020408163265306) &
-                 + in(i+0,j+1) * (0.07142857142857142) &
-                 + in(i+0,j+2) * (0.03571428571428571) &
-                 + in(i+0,j+3) * (0.023809523809523808) &
-                 + in(i+0,j+4) * (0.017857142857142856) &
-                 + in(i+0,j+5) * (0.014285714285714285) &
-                 + in(i+0,j+6) * (0.011904761904761904) &
-                 + in(i+0,j+7) * (0.01020408163265306) &
+                 + in(i+0,j-7) * (-0.01020408163265306d0) &
+                 + in(i+0,j-6) * (-0.011904761904761904d0) &
+                 + in(i+0,j-5) * (-0.014285714285714285d0) &
+                 + in(i+0,j-4) * (-0.017857142857142856d0) &
+                 + in(i+0,j-3) * (-0.023809523809523808d0) &
+                 + in(i+0,j-2) * (-0.03571428571428571d0) &
+                 + in(i+0,j-1) * (-0.07142857142857142d0) &
+                 + in(i-7,j+0) * (-0.01020408163265306d0) &
+                 + in(i-6,j+0) * (-0.011904761904761904d0) &
+                 + in(i-5,j+0) * (-0.014285714285714285d0) &
+                 + in(i-4,j+0) * (-0.017857142857142856d0) &
+                 + in(i-3,j+0) * (-0.023809523809523808d0) &
+                 + in(i-2,j+0) * (-0.03571428571428571d0) &
+                 + in(i-1,j+0) * (-0.07142857142857142d0) &
+                 + in(i+1,j+0) * (0.07142857142857142d0) &
+                 + in(i+2,j+0) * (0.03571428571428571d0) &
+                 + in(i+3,j+0) * (0.023809523809523808d0) &
+                 + in(i+4,j+0) * (0.017857142857142856d0) &
+                 + in(i+5,j+0) * (0.014285714285714285d0) &
+                 + in(i+6,j+0) * (0.011904761904761904d0) &
+                 + in(i+7,j+0) * (0.01020408163265306d0) &
+                 + in(i+0,j+1) * (0.07142857142857142d0) &
+                 + in(i+0,j+2) * (0.03571428571428571d0) &
+                 + in(i+0,j+3) * (0.023809523809523808d0) &
+                 + in(i+0,j+4) * (0.017857142857142856d0) &
+                 + in(i+0,j+5) * (0.014285714285714285d0) &
+                 + in(i+0,j+6) * (0.011904761904761904d0) &
+                 + in(i+0,j+7) * (0.01020408163265306d0) &
 +0.0
       end do
       !$omp end simd
@@ -253,40 +267,42 @@ subroutine star8(n, in, out)
     !$omp do
     do i=8,n-8-1
       !$omp simd
+      do j=8,n-8-1
+    do i=8,n-8-1
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-8) * (-0.0078125) &
-                 + in(i+0,j-7) * (-0.008928571428571428) &
-                 + in(i+0,j-6) * (-0.010416666666666666) &
-                 + in(i+0,j-5) * (-0.0125) &
-                 + in(i+0,j-4) * (-0.015625) &
-                 + in(i+0,j-3) * (-0.020833333333333332) &
-                 + in(i+0,j-2) * (-0.03125) &
-                 + in(i+0,j-1) * (-0.0625) &
-                 + in(i-8,j+0) * (-0.0078125) &
-                 + in(i-7,j+0) * (-0.008928571428571428) &
-                 + in(i-6,j+0) * (-0.010416666666666666) &
-                 + in(i-5,j+0) * (-0.0125) &
-                 + in(i-4,j+0) * (-0.015625) &
-                 + in(i-3,j+0) * (-0.020833333333333332) &
-                 + in(i-2,j+0) * (-0.03125) &
-                 + in(i-1,j+0) * (-0.0625) &
-                 + in(i+1,j+0) * (0.0625) &
-                 + in(i+2,j+0) * (0.03125) &
-                 + in(i+3,j+0) * (0.020833333333333332) &
-                 + in(i+4,j+0) * (0.015625) &
-                 + in(i+5,j+0) * (0.0125) &
-                 + in(i+6,j+0) * (0.010416666666666666) &
-                 + in(i+7,j+0) * (0.008928571428571428) &
-                 + in(i+8,j+0) * (0.0078125) &
-                 + in(i+0,j+1) * (0.0625) &
-                 + in(i+0,j+2) * (0.03125) &
-                 + in(i+0,j+3) * (0.020833333333333332) &
-                 + in(i+0,j+4) * (0.015625) &
-                 + in(i+0,j+5) * (0.0125) &
-                 + in(i+0,j+6) * (0.010416666666666666) &
-                 + in(i+0,j+7) * (0.008928571428571428) &
-                 + in(i+0,j+8) * (0.0078125) &
+                 + in(i+0,j-8) * (-0.0078125d0) &
+                 + in(i+0,j-7) * (-0.008928571428571428d0) &
+                 + in(i+0,j-6) * (-0.010416666666666666d0) &
+                 + in(i+0,j-5) * (-0.0125d0) &
+                 + in(i+0,j-4) * (-0.015625d0) &
+                 + in(i+0,j-3) * (-0.020833333333333332d0) &
+                 + in(i+0,j-2) * (-0.03125d0) &
+                 + in(i+0,j-1) * (-0.0625d0) &
+                 + in(i-8,j+0) * (-0.0078125d0) &
+                 + in(i-7,j+0) * (-0.008928571428571428d0) &
+                 + in(i-6,j+0) * (-0.010416666666666666d0) &
+                 + in(i-5,j+0) * (-0.0125d0) &
+                 + in(i-4,j+0) * (-0.015625d0) &
+                 + in(i-3,j+0) * (-0.020833333333333332d0) &
+                 + in(i-2,j+0) * (-0.03125d0) &
+                 + in(i-1,j+0) * (-0.0625d0) &
+                 + in(i+1,j+0) * (0.0625d0) &
+                 + in(i+2,j+0) * (0.03125d0) &
+                 + in(i+3,j+0) * (0.020833333333333332d0) &
+                 + in(i+4,j+0) * (0.015625d0) &
+                 + in(i+5,j+0) * (0.0125d0) &
+                 + in(i+6,j+0) * (0.010416666666666666d0) &
+                 + in(i+7,j+0) * (0.008928571428571428d0) &
+                 + in(i+8,j+0) * (0.0078125d0) &
+                 + in(i+0,j+1) * (0.0625d0) &
+                 + in(i+0,j+2) * (0.03125d0) &
+                 + in(i+0,j+3) * (0.020833333333333332d0) &
+                 + in(i+0,j+4) * (0.015625d0) &
+                 + in(i+0,j+5) * (0.0125d0) &
+                 + in(i+0,j+6) * (0.010416666666666666d0) &
+                 + in(i+0,j+7) * (0.008928571428571428d0) &
+                 + in(i+0,j+8) * (0.0078125d0) &
 +0.0
       end do
       !$omp end simd
@@ -304,44 +320,46 @@ subroutine star9(n, in, out)
     !$omp do
     do i=9,n-9-1
       !$omp simd
+      do j=9,n-9-1
+    do i=9,n-9-1
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-9) * (-0.006172839506172839) &
-                 + in(i+0,j-8) * (-0.006944444444444444) &
-                 + in(i+0,j-7) * (-0.007936507936507936) &
-                 + in(i+0,j-6) * (-0.009259259259259259) &
-                 + in(i+0,j-5) * (-0.011111111111111112) &
-                 + in(i+0,j-4) * (-0.013888888888888888) &
-                 + in(i+0,j-3) * (-0.018518518518518517) &
-                 + in(i+0,j-2) * (-0.027777777777777776) &
-                 + in(i+0,j-1) * (-0.05555555555555555) &
-                 + in(i-9,j+0) * (-0.006172839506172839) &
-                 + in(i-8,j+0) * (-0.006944444444444444) &
-                 + in(i-7,j+0) * (-0.007936507936507936) &
-                 + in(i-6,j+0) * (-0.009259259259259259) &
-                 + in(i-5,j+0) * (-0.011111111111111112) &
-                 + in(i-4,j+0) * (-0.013888888888888888) &
-                 + in(i-3,j+0) * (-0.018518518518518517) &
-                 + in(i-2,j+0) * (-0.027777777777777776) &
-                 + in(i-1,j+0) * (-0.05555555555555555) &
-                 + in(i+1,j+0) * (0.05555555555555555) &
-                 + in(i+2,j+0) * (0.027777777777777776) &
-                 + in(i+3,j+0) * (0.018518518518518517) &
-                 + in(i+4,j+0) * (0.013888888888888888) &
-                 + in(i+5,j+0) * (0.011111111111111112) &
-                 + in(i+6,j+0) * (0.009259259259259259) &
-                 + in(i+7,j+0) * (0.007936507936507936) &
-                 + in(i+8,j+0) * (0.006944444444444444) &
-                 + in(i+9,j+0) * (0.006172839506172839) &
-                 + in(i+0,j+1) * (0.05555555555555555) &
-                 + in(i+0,j+2) * (0.027777777777777776) &
-                 + in(i+0,j+3) * (0.018518518518518517) &
-                 + in(i+0,j+4) * (0.013888888888888888) &
-                 + in(i+0,j+5) * (0.011111111111111112) &
-                 + in(i+0,j+6) * (0.009259259259259259) &
-                 + in(i+0,j+7) * (0.007936507936507936) &
-                 + in(i+0,j+8) * (0.006944444444444444) &
-                 + in(i+0,j+9) * (0.006172839506172839) &
+                 + in(i+0,j-9) * (-0.006172839506172839d0) &
+                 + in(i+0,j-8) * (-0.006944444444444444d0) &
+                 + in(i+0,j-7) * (-0.007936507936507936d0) &
+                 + in(i+0,j-6) * (-0.009259259259259259d0) &
+                 + in(i+0,j-5) * (-0.011111111111111112d0) &
+                 + in(i+0,j-4) * (-0.013888888888888888d0) &
+                 + in(i+0,j-3) * (-0.018518518518518517d0) &
+                 + in(i+0,j-2) * (-0.027777777777777776d0) &
+                 + in(i+0,j-1) * (-0.05555555555555555d0) &
+                 + in(i-9,j+0) * (-0.006172839506172839d0) &
+                 + in(i-8,j+0) * (-0.006944444444444444d0) &
+                 + in(i-7,j+0) * (-0.007936507936507936d0) &
+                 + in(i-6,j+0) * (-0.009259259259259259d0) &
+                 + in(i-5,j+0) * (-0.011111111111111112d0) &
+                 + in(i-4,j+0) * (-0.013888888888888888d0) &
+                 + in(i-3,j+0) * (-0.018518518518518517d0) &
+                 + in(i-2,j+0) * (-0.027777777777777776d0) &
+                 + in(i-1,j+0) * (-0.05555555555555555d0) &
+                 + in(i+1,j+0) * (0.05555555555555555d0) &
+                 + in(i+2,j+0) * (0.027777777777777776d0) &
+                 + in(i+3,j+0) * (0.018518518518518517d0) &
+                 + in(i+4,j+0) * (0.013888888888888888d0) &
+                 + in(i+5,j+0) * (0.011111111111111112d0) &
+                 + in(i+6,j+0) * (0.009259259259259259d0) &
+                 + in(i+7,j+0) * (0.007936507936507936d0) &
+                 + in(i+8,j+0) * (0.006944444444444444d0) &
+                 + in(i+9,j+0) * (0.006172839506172839d0) &
+                 + in(i+0,j+1) * (0.05555555555555555d0) &
+                 + in(i+0,j+2) * (0.027777777777777776d0) &
+                 + in(i+0,j+3) * (0.018518518518518517d0) &
+                 + in(i+0,j+4) * (0.013888888888888888d0) &
+                 + in(i+0,j+5) * (0.011111111111111112d0) &
+                 + in(i+0,j+6) * (0.009259259259259259d0) &
+                 + in(i+0,j+7) * (0.007936507936507936d0) &
+                 + in(i+0,j+8) * (0.006944444444444444d0) &
+                 + in(i+0,j+9) * (0.006172839506172839d0) &
 +0.0
       end do
       !$omp end simd
@@ -359,12 +377,14 @@ subroutine grid1(n, in, out)
     !$omp do
     do i=1,n-1-1
       !$omp simd
+      do j=1,n-1-1
+    do i=1,n-1-1
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i-1,j-1) * (-0.25) &
-                 + in(i+1,j-1) * (-0.25) &
-                 + in(i-1,j+1) * (-0.25) &
-                 + in(i+1,j+1) * (0.25) &
+                 + in(i-1,j-1) * (-0.25d0) &
+                 + in(i+1,j-1) * (-0.25d0) &
+                 + in(i-1,j+1) * (-0.25d0) &
+                 + in(i+1,j+1) * (0.25d0) &
 +0.0
       end do
       !$omp end simd
@@ -382,22 +402,24 @@ subroutine grid2(n, in, out)
     !$omp do
     do i=2,n-2-1
       !$omp simd
+      do j=2,n-2-1
+    do i=2,n-2-1
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i-2,j-2) * (-0.0625) &
-                 + in(i+1,j-2) * (-0.020833333333333332) &
-                 + in(i+2,j-2) * (-0.020833333333333332) &
-                 + in(i-1,j-1) * (-0.125) &
-                 + in(i+1,j-1) * (-0.125) &
-                 + in(i+2,j-1) * (-0.125) &
-                 + in(i-2,j+1) * (-0.020833333333333332) &
-                 + in(i-1,j+1) * (-0.125) &
-                 + in(i+1,j+1) * (0.125) &
-                 + in(i+2,j+1) * (0.020833333333333332) &
-                 + in(i-2,j+2) * (-0.020833333333333332) &
-                 + in(i-1,j+2) * (-0.125) &
-                 + in(i+1,j+2) * (0.020833333333333332) &
-                 + in(i+2,j+2) * (0.0625) &
+                 + in(i-2,j-2) * (-0.0625d0) &
+                 + in(i+1,j-2) * (-0.020833333333333332d0) &
+                 + in(i+2,j-2) * (-0.020833333333333332d0) &
+                 + in(i-1,j-1) * (-0.125d0) &
+                 + in(i+1,j-1) * (-0.125d0) &
+                 + in(i+2,j-1) * (-0.125d0) &
+                 + in(i-2,j+1) * (-0.020833333333333332d0) &
+                 + in(i-1,j+1) * (-0.125d0) &
+                 + in(i+1,j+1) * (0.125d0) &
+                 + in(i+2,j+1) * (0.020833333333333332d0) &
+                 + in(i-2,j+2) * (-0.020833333333333332d0) &
+                 + in(i-1,j+2) * (-0.125d0) &
+                 + in(i+1,j+2) * (0.020833333333333332d0) &
+                 + in(i+2,j+2) * (0.0625d0) &
 +0.0
       end do
       !$omp end simd
@@ -415,38 +437,40 @@ subroutine grid3(n, in, out)
     !$omp do
     do i=3,n-3-1
       !$omp simd
+      do j=3,n-3-1
+    do i=3,n-3-1
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i-3,j-3) * (-0.027777777777777776) &
-                 + in(i+1,j-3) * (-0.005555555555555556) &
-                 + in(i+2,j-3) * (-0.005555555555555556) &
-                 + in(i+3,j-3) * (-0.005555555555555556) &
-                 + in(i-2,j-2) * (-0.041666666666666664) &
-                 + in(i+1,j-2) * (-0.013888888888888888) &
-                 + in(i+2,j-2) * (-0.013888888888888888) &
-                 + in(i+3,j-2) * (-0.013888888888888888) &
-                 + in(i-1,j-1) * (-0.08333333333333333) &
-                 + in(i+1,j-1) * (-0.08333333333333333) &
-                 + in(i+2,j-1) * (-0.08333333333333333) &
-                 + in(i+3,j-1) * (-0.08333333333333333) &
-                 + in(i-3,j+1) * (-0.005555555555555556) &
-                 + in(i-2,j+1) * (-0.013888888888888888) &
-                 + in(i-1,j+1) * (-0.08333333333333333) &
-                 + in(i+1,j+1) * (0.08333333333333333) &
-                 + in(i+2,j+1) * (0.013888888888888888) &
-                 + in(i+3,j+1) * (0.005555555555555556) &
-                 + in(i-3,j+2) * (-0.005555555555555556) &
-                 + in(i-2,j+2) * (-0.013888888888888888) &
-                 + in(i-1,j+2) * (-0.08333333333333333) &
-                 + in(i+1,j+2) * (0.013888888888888888) &
-                 + in(i+2,j+2) * (0.041666666666666664) &
-                 + in(i+3,j+2) * (0.005555555555555556) &
-                 + in(i-3,j+3) * (-0.005555555555555556) &
-                 + in(i-2,j+3) * (-0.013888888888888888) &
-                 + in(i-1,j+3) * (-0.08333333333333333) &
-                 + in(i+1,j+3) * (0.005555555555555556) &
-                 + in(i+2,j+3) * (0.005555555555555556) &
-                 + in(i+3,j+3) * (0.027777777777777776) &
+                 + in(i-3,j-3) * (-0.027777777777777776d0) &
+                 + in(i+1,j-3) * (-0.005555555555555556d0) &
+                 + in(i+2,j-3) * (-0.005555555555555556d0) &
+                 + in(i+3,j-3) * (-0.005555555555555556d0) &
+                 + in(i-2,j-2) * (-0.041666666666666664d0) &
+                 + in(i+1,j-2) * (-0.013888888888888888d0) &
+                 + in(i+2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+3,j-2) * (-0.013888888888888888d0) &
+                 + in(i-1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+2,j-1) * (-0.08333333333333333d0) &
+                 + in(i+3,j-1) * (-0.08333333333333333d0) &
+                 + in(i-3,j+1) * (-0.005555555555555556d0) &
+                 + in(i-2,j+1) * (-0.013888888888888888d0) &
+                 + in(i-1,j+1) * (-0.08333333333333333d0) &
+                 + in(i+1,j+1) * (0.08333333333333333d0) &
+                 + in(i+2,j+1) * (0.013888888888888888d0) &
+                 + in(i+3,j+1) * (0.005555555555555556d0) &
+                 + in(i-3,j+2) * (-0.005555555555555556d0) &
+                 + in(i-2,j+2) * (-0.013888888888888888d0) &
+                 + in(i-1,j+2) * (-0.08333333333333333d0) &
+                 + in(i+1,j+2) * (0.013888888888888888d0) &
+                 + in(i+2,j+2) * (0.041666666666666664d0) &
+                 + in(i+3,j+2) * (0.005555555555555556d0) &
+                 + in(i-3,j+3) * (-0.005555555555555556d0) &
+                 + in(i-2,j+3) * (-0.013888888888888888d0) &
+                 + in(i-1,j+3) * (-0.08333333333333333d0) &
+                 + in(i+1,j+3) * (0.005555555555555556d0) &
+                 + in(i+2,j+3) * (0.005555555555555556d0) &
+                 + in(i+3,j+3) * (0.027777777777777776d0) &
 +0.0
       end do
       !$omp end simd
@@ -464,60 +488,62 @@ subroutine grid4(n, in, out)
     !$omp do
     do i=4,n-4-1
       !$omp simd
+      do j=4,n-4-1
+    do i=4,n-4-1
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i-4,j-4) * (-0.015625) &
-                 + in(i+1,j-4) * (-0.002232142857142857) &
-                 + in(i+2,j-4) * (-0.002232142857142857) &
-                 + in(i+3,j-4) * (-0.002232142857142857) &
-                 + in(i+4,j-4) * (-0.002232142857142857) &
-                 + in(i-3,j-3) * (-0.020833333333333332) &
-                 + in(i+1,j-3) * (-0.004166666666666667) &
-                 + in(i+2,j-3) * (-0.004166666666666667) &
-                 + in(i+3,j-3) * (-0.004166666666666667) &
-                 + in(i+4,j-3) * (-0.004166666666666667) &
-                 + in(i-2,j-2) * (-0.03125) &
-                 + in(i+1,j-2) * (-0.010416666666666666) &
-                 + in(i+2,j-2) * (-0.010416666666666666) &
-                 + in(i+3,j-2) * (-0.010416666666666666) &
-                 + in(i+4,j-2) * (-0.010416666666666666) &
-                 + in(i-1,j-1) * (-0.0625) &
-                 + in(i+1,j-1) * (-0.0625) &
-                 + in(i+2,j-1) * (-0.0625) &
-                 + in(i+3,j-1) * (-0.0625) &
-                 + in(i+4,j-1) * (-0.0625) &
-                 + in(i-4,j+1) * (-0.002232142857142857) &
-                 + in(i-3,j+1) * (-0.004166666666666667) &
-                 + in(i-2,j+1) * (-0.010416666666666666) &
-                 + in(i-1,j+1) * (-0.0625) &
-                 + in(i+1,j+1) * (0.0625) &
-                 + in(i+2,j+1) * (0.010416666666666666) &
-                 + in(i+3,j+1) * (0.004166666666666667) &
-                 + in(i+4,j+1) * (0.002232142857142857) &
-                 + in(i-4,j+2) * (-0.002232142857142857) &
-                 + in(i-3,j+2) * (-0.004166666666666667) &
-                 + in(i-2,j+2) * (-0.010416666666666666) &
-                 + in(i-1,j+2) * (-0.0625) &
-                 + in(i+1,j+2) * (0.010416666666666666) &
-                 + in(i+2,j+2) * (0.03125) &
-                 + in(i+3,j+2) * (0.004166666666666667) &
-                 + in(i+4,j+2) * (0.002232142857142857) &
-                 + in(i-4,j+3) * (-0.002232142857142857) &
-                 + in(i-3,j+3) * (-0.004166666666666667) &
-                 + in(i-2,j+3) * (-0.010416666666666666) &
-                 + in(i-1,j+3) * (-0.0625) &
-                 + in(i+1,j+3) * (0.004166666666666667) &
-                 + in(i+2,j+3) * (0.004166666666666667) &
-                 + in(i+3,j+3) * (0.020833333333333332) &
-                 + in(i+4,j+3) * (0.002232142857142857) &
-                 + in(i-4,j+4) * (-0.002232142857142857) &
-                 + in(i-3,j+4) * (-0.004166666666666667) &
-                 + in(i-2,j+4) * (-0.010416666666666666) &
-                 + in(i-1,j+4) * (-0.0625) &
-                 + in(i+1,j+4) * (0.002232142857142857) &
-                 + in(i+2,j+4) * (0.002232142857142857) &
-                 + in(i+3,j+4) * (0.002232142857142857) &
-                 + in(i+4,j+4) * (0.015625) &
+                 + in(i-4,j-4) * (-0.015625d0) &
+                 + in(i+1,j-4) * (-0.002232142857142857d0) &
+                 + in(i+2,j-4) * (-0.002232142857142857d0) &
+                 + in(i+3,j-4) * (-0.002232142857142857d0) &
+                 + in(i+4,j-4) * (-0.002232142857142857d0) &
+                 + in(i-3,j-3) * (-0.020833333333333332d0) &
+                 + in(i+1,j-3) * (-0.004166666666666667d0) &
+                 + in(i+2,j-3) * (-0.004166666666666667d0) &
+                 + in(i+3,j-3) * (-0.004166666666666667d0) &
+                 + in(i+4,j-3) * (-0.004166666666666667d0) &
+                 + in(i-2,j-2) * (-0.03125d0) &
+                 + in(i+1,j-2) * (-0.010416666666666666d0) &
+                 + in(i+2,j-2) * (-0.010416666666666666d0) &
+                 + in(i+3,j-2) * (-0.010416666666666666d0) &
+                 + in(i+4,j-2) * (-0.010416666666666666d0) &
+                 + in(i-1,j-1) * (-0.0625d0) &
+                 + in(i+1,j-1) * (-0.0625d0) &
+                 + in(i+2,j-1) * (-0.0625d0) &
+                 + in(i+3,j-1) * (-0.0625d0) &
+                 + in(i+4,j-1) * (-0.0625d0) &
+                 + in(i-4,j+1) * (-0.002232142857142857d0) &
+                 + in(i-3,j+1) * (-0.004166666666666667d0) &
+                 + in(i-2,j+1) * (-0.010416666666666666d0) &
+                 + in(i-1,j+1) * (-0.0625d0) &
+                 + in(i+1,j+1) * (0.0625d0) &
+                 + in(i+2,j+1) * (0.010416666666666666d0) &
+                 + in(i+3,j+1) * (0.004166666666666667d0) &
+                 + in(i+4,j+1) * (0.002232142857142857d0) &
+                 + in(i-4,j+2) * (-0.002232142857142857d0) &
+                 + in(i-3,j+2) * (-0.004166666666666667d0) &
+                 + in(i-2,j+2) * (-0.010416666666666666d0) &
+                 + in(i-1,j+2) * (-0.0625d0) &
+                 + in(i+1,j+2) * (0.010416666666666666d0) &
+                 + in(i+2,j+2) * (0.03125d0) &
+                 + in(i+3,j+2) * (0.004166666666666667d0) &
+                 + in(i+4,j+2) * (0.002232142857142857d0) &
+                 + in(i-4,j+3) * (-0.002232142857142857d0) &
+                 + in(i-3,j+3) * (-0.004166666666666667d0) &
+                 + in(i-2,j+3) * (-0.010416666666666666d0) &
+                 + in(i-1,j+3) * (-0.0625d0) &
+                 + in(i+1,j+3) * (0.004166666666666667d0) &
+                 + in(i+2,j+3) * (0.004166666666666667d0) &
+                 + in(i+3,j+3) * (0.020833333333333332d0) &
+                 + in(i+4,j+3) * (0.002232142857142857d0) &
+                 + in(i-4,j+4) * (-0.002232142857142857d0) &
+                 + in(i-3,j+4) * (-0.004166666666666667d0) &
+                 + in(i-2,j+4) * (-0.010416666666666666d0) &
+                 + in(i-1,j+4) * (-0.0625d0) &
+                 + in(i+1,j+4) * (0.002232142857142857d0) &
+                 + in(i+2,j+4) * (0.002232142857142857d0) &
+                 + in(i+3,j+4) * (0.002232142857142857d0) &
+                 + in(i+4,j+4) * (0.015625d0) &
 +0.0
       end do
       !$omp end simd
@@ -535,88 +561,90 @@ subroutine grid5(n, in, out)
     !$omp do
     do i=5,n-5-1
       !$omp simd
+      do j=5,n-5-1
+    do i=5,n-5-1
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i-5,j-5) * (-0.01) &
-                 + in(i+1,j-5) * (-0.0011111111111111111) &
-                 + in(i+2,j-5) * (-0.0011111111111111111) &
-                 + in(i+3,j-5) * (-0.0011111111111111111) &
-                 + in(i+4,j-5) * (-0.0011111111111111111) &
-                 + in(i+5,j-5) * (-0.0011111111111111111) &
-                 + in(i-4,j-4) * (-0.0125) &
-                 + in(i+1,j-4) * (-0.0017857142857142857) &
-                 + in(i+2,j-4) * (-0.0017857142857142857) &
-                 + in(i+3,j-4) * (-0.0017857142857142857) &
-                 + in(i+4,j-4) * (-0.0017857142857142857) &
-                 + in(i+5,j-4) * (-0.0017857142857142857) &
-                 + in(i-3,j-3) * (-0.016666666666666666) &
-                 + in(i+1,j-3) * (-0.0033333333333333335) &
-                 + in(i+2,j-3) * (-0.0033333333333333335) &
-                 + in(i+3,j-3) * (-0.0033333333333333335) &
-                 + in(i+4,j-3) * (-0.0033333333333333335) &
-                 + in(i+5,j-3) * (-0.0033333333333333335) &
-                 + in(i-2,j-2) * (-0.025) &
-                 + in(i+1,j-2) * (-0.008333333333333333) &
-                 + in(i+2,j-2) * (-0.008333333333333333) &
-                 + in(i+3,j-2) * (-0.008333333333333333) &
-                 + in(i+4,j-2) * (-0.008333333333333333) &
-                 + in(i+5,j-2) * (-0.008333333333333333) &
-                 + in(i-1,j-1) * (-0.05) &
-                 + in(i+1,j-1) * (-0.05) &
-                 + in(i+2,j-1) * (-0.05) &
-                 + in(i+3,j-1) * (-0.05) &
-                 + in(i+4,j-1) * (-0.05) &
-                 + in(i+5,j-1) * (-0.05) &
-                 + in(i-5,j+1) * (-0.0011111111111111111) &
-                 + in(i-4,j+1) * (-0.0017857142857142857) &
-                 + in(i-3,j+1) * (-0.0033333333333333335) &
-                 + in(i-2,j+1) * (-0.008333333333333333) &
-                 + in(i-1,j+1) * (-0.05) &
-                 + in(i+1,j+1) * (0.05) &
-                 + in(i+2,j+1) * (0.008333333333333333) &
-                 + in(i+3,j+1) * (0.0033333333333333335) &
-                 + in(i+4,j+1) * (0.0017857142857142857) &
-                 + in(i+5,j+1) * (0.0011111111111111111) &
-                 + in(i-5,j+2) * (-0.0011111111111111111) &
-                 + in(i-4,j+2) * (-0.0017857142857142857) &
-                 + in(i-3,j+2) * (-0.0033333333333333335) &
-                 + in(i-2,j+2) * (-0.008333333333333333) &
-                 + in(i-1,j+2) * (-0.05) &
-                 + in(i+1,j+2) * (0.008333333333333333) &
-                 + in(i+2,j+2) * (0.025) &
-                 + in(i+3,j+2) * (0.0033333333333333335) &
-                 + in(i+4,j+2) * (0.0017857142857142857) &
-                 + in(i+5,j+2) * (0.0011111111111111111) &
-                 + in(i-5,j+3) * (-0.0011111111111111111) &
-                 + in(i-4,j+3) * (-0.0017857142857142857) &
-                 + in(i-3,j+3) * (-0.0033333333333333335) &
-                 + in(i-2,j+3) * (-0.008333333333333333) &
-                 + in(i-1,j+3) * (-0.05) &
-                 + in(i+1,j+3) * (0.0033333333333333335) &
-                 + in(i+2,j+3) * (0.0033333333333333335) &
-                 + in(i+3,j+3) * (0.016666666666666666) &
-                 + in(i+4,j+3) * (0.0017857142857142857) &
-                 + in(i+5,j+3) * (0.0011111111111111111) &
-                 + in(i-5,j+4) * (-0.0011111111111111111) &
-                 + in(i-4,j+4) * (-0.0017857142857142857) &
-                 + in(i-3,j+4) * (-0.0033333333333333335) &
-                 + in(i-2,j+4) * (-0.008333333333333333) &
-                 + in(i-1,j+4) * (-0.05) &
-                 + in(i+1,j+4) * (0.0017857142857142857) &
-                 + in(i+2,j+4) * (0.0017857142857142857) &
-                 + in(i+3,j+4) * (0.0017857142857142857) &
-                 + in(i+4,j+4) * (0.0125) &
-                 + in(i+5,j+4) * (0.0011111111111111111) &
-                 + in(i-5,j+5) * (-0.0011111111111111111) &
-                 + in(i-4,j+5) * (-0.0017857142857142857) &
-                 + in(i-3,j+5) * (-0.0033333333333333335) &
-                 + in(i-2,j+5) * (-0.008333333333333333) &
-                 + in(i-1,j+5) * (-0.05) &
-                 + in(i+1,j+5) * (0.0011111111111111111) &
-                 + in(i+2,j+5) * (0.0011111111111111111) &
-                 + in(i+3,j+5) * (0.0011111111111111111) &
-                 + in(i+4,j+5) * (0.0011111111111111111) &
-                 + in(i+5,j+5) * (0.01) &
+                 + in(i-5,j-5) * (-0.01d0) &
+                 + in(i+1,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+2,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+3,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+4,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+5,j-5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j-4) * (-0.0125d0) &
+                 + in(i+1,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+2,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+3,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+4,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+5,j-4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j-3) * (-0.016666666666666666d0) &
+                 + in(i+1,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+2,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+3,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+4,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+5,j-3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j-2) * (-0.025d0) &
+                 + in(i+1,j-2) * (-0.008333333333333333d0) &
+                 + in(i+2,j-2) * (-0.008333333333333333d0) &
+                 + in(i+3,j-2) * (-0.008333333333333333d0) &
+                 + in(i+4,j-2) * (-0.008333333333333333d0) &
+                 + in(i+5,j-2) * (-0.008333333333333333d0) &
+                 + in(i-1,j-1) * (-0.05d0) &
+                 + in(i+1,j-1) * (-0.05d0) &
+                 + in(i+2,j-1) * (-0.05d0) &
+                 + in(i+3,j-1) * (-0.05d0) &
+                 + in(i+4,j-1) * (-0.05d0) &
+                 + in(i+5,j-1) * (-0.05d0) &
+                 + in(i-5,j+1) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+1) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+1) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+1) * (-0.008333333333333333d0) &
+                 + in(i-1,j+1) * (-0.05d0) &
+                 + in(i+1,j+1) * (0.05d0) &
+                 + in(i+2,j+1) * (0.008333333333333333d0) &
+                 + in(i+3,j+1) * (0.0033333333333333335d0) &
+                 + in(i+4,j+1) * (0.0017857142857142857d0) &
+                 + in(i+5,j+1) * (0.0011111111111111111d0) &
+                 + in(i-5,j+2) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+2) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+2) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+2) * (-0.008333333333333333d0) &
+                 + in(i-1,j+2) * (-0.05d0) &
+                 + in(i+1,j+2) * (0.008333333333333333d0) &
+                 + in(i+2,j+2) * (0.025d0) &
+                 + in(i+3,j+2) * (0.0033333333333333335d0) &
+                 + in(i+4,j+2) * (0.0017857142857142857d0) &
+                 + in(i+5,j+2) * (0.0011111111111111111d0) &
+                 + in(i-5,j+3) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+3) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+3) * (-0.008333333333333333d0) &
+                 + in(i-1,j+3) * (-0.05d0) &
+                 + in(i+1,j+3) * (0.0033333333333333335d0) &
+                 + in(i+2,j+3) * (0.0033333333333333335d0) &
+                 + in(i+3,j+3) * (0.016666666666666666d0) &
+                 + in(i+4,j+3) * (0.0017857142857142857d0) &
+                 + in(i+5,j+3) * (0.0011111111111111111d0) &
+                 + in(i-5,j+4) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+4) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+4) * (-0.008333333333333333d0) &
+                 + in(i-1,j+4) * (-0.05d0) &
+                 + in(i+1,j+4) * (0.0017857142857142857d0) &
+                 + in(i+2,j+4) * (0.0017857142857142857d0) &
+                 + in(i+3,j+4) * (0.0017857142857142857d0) &
+                 + in(i+4,j+4) * (0.0125d0) &
+                 + in(i+5,j+4) * (0.0011111111111111111d0) &
+                 + in(i-5,j+5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+5) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+5) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+5) * (-0.008333333333333333d0) &
+                 + in(i-1,j+5) * (-0.05d0) &
+                 + in(i+1,j+5) * (0.0011111111111111111d0) &
+                 + in(i+2,j+5) * (0.0011111111111111111d0) &
+                 + in(i+3,j+5) * (0.0011111111111111111d0) &
+                 + in(i+4,j+5) * (0.0011111111111111111d0) &
+                 + in(i+5,j+5) * (0.01d0) &
 +0.0
       end do
       !$omp end simd
@@ -634,122 +662,124 @@ subroutine grid6(n, in, out)
     !$omp do
     do i=6,n-6-1
       !$omp simd
+      do j=6,n-6-1
+    do i=6,n-6-1
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i-6,j-6) * (-0.006944444444444444) &
-                 + in(i+1,j-6) * (-0.0006313131313131314) &
-                 + in(i+2,j-6) * (-0.0006313131313131314) &
-                 + in(i+3,j-6) * (-0.0006313131313131314) &
-                 + in(i+4,j-6) * (-0.0006313131313131314) &
-                 + in(i+5,j-6) * (-0.0006313131313131314) &
-                 + in(i+6,j-6) * (-0.0006313131313131314) &
-                 + in(i-5,j-5) * (-0.008333333333333333) &
-                 + in(i+1,j-5) * (-0.000925925925925926) &
-                 + in(i+2,j-5) * (-0.000925925925925926) &
-                 + in(i+3,j-5) * (-0.000925925925925926) &
-                 + in(i+4,j-5) * (-0.000925925925925926) &
-                 + in(i+5,j-5) * (-0.000925925925925926) &
-                 + in(i+6,j-5) * (-0.000925925925925926) &
-                 + in(i-4,j-4) * (-0.010416666666666666) &
-                 + in(i+1,j-4) * (-0.001488095238095238) &
-                 + in(i+2,j-4) * (-0.001488095238095238) &
-                 + in(i+3,j-4) * (-0.001488095238095238) &
-                 + in(i+4,j-4) * (-0.001488095238095238) &
-                 + in(i+5,j-4) * (-0.001488095238095238) &
-                 + in(i+6,j-4) * (-0.001488095238095238) &
-                 + in(i-3,j-3) * (-0.013888888888888888) &
-                 + in(i+1,j-3) * (-0.002777777777777778) &
-                 + in(i+2,j-3) * (-0.002777777777777778) &
-                 + in(i+3,j-3) * (-0.002777777777777778) &
-                 + in(i+4,j-3) * (-0.002777777777777778) &
-                 + in(i+5,j-3) * (-0.002777777777777778) &
-                 + in(i+6,j-3) * (-0.002777777777777778) &
-                 + in(i-2,j-2) * (-0.020833333333333332) &
-                 + in(i+1,j-2) * (-0.006944444444444444) &
-                 + in(i+2,j-2) * (-0.006944444444444444) &
-                 + in(i+3,j-2) * (-0.006944444444444444) &
-                 + in(i+4,j-2) * (-0.006944444444444444) &
-                 + in(i+5,j-2) * (-0.006944444444444444) &
-                 + in(i+6,j-2) * (-0.006944444444444444) &
-                 + in(i-1,j-1) * (-0.041666666666666664) &
-                 + in(i+1,j-1) * (-0.041666666666666664) &
-                 + in(i+2,j-1) * (-0.041666666666666664) &
-                 + in(i+3,j-1) * (-0.041666666666666664) &
-                 + in(i+4,j-1) * (-0.041666666666666664) &
-                 + in(i+5,j-1) * (-0.041666666666666664) &
-                 + in(i+6,j-1) * (-0.041666666666666664) &
-                 + in(i-6,j+1) * (-0.0006313131313131314) &
-                 + in(i-5,j+1) * (-0.000925925925925926) &
-                 + in(i-4,j+1) * (-0.001488095238095238) &
-                 + in(i-3,j+1) * (-0.002777777777777778) &
-                 + in(i-2,j+1) * (-0.006944444444444444) &
-                 + in(i-1,j+1) * (-0.041666666666666664) &
-                 + in(i+1,j+1) * (0.041666666666666664) &
-                 + in(i+2,j+1) * (0.006944444444444444) &
-                 + in(i+3,j+1) * (0.002777777777777778) &
-                 + in(i+4,j+1) * (0.001488095238095238) &
-                 + in(i+5,j+1) * (0.000925925925925926) &
-                 + in(i+6,j+1) * (0.0006313131313131314) &
-                 + in(i-6,j+2) * (-0.0006313131313131314) &
-                 + in(i-5,j+2) * (-0.000925925925925926) &
-                 + in(i-4,j+2) * (-0.001488095238095238) &
-                 + in(i-3,j+2) * (-0.002777777777777778) &
-                 + in(i-2,j+2) * (-0.006944444444444444) &
-                 + in(i-1,j+2) * (-0.041666666666666664) &
-                 + in(i+1,j+2) * (0.006944444444444444) &
-                 + in(i+2,j+2) * (0.020833333333333332) &
-                 + in(i+3,j+2) * (0.002777777777777778) &
-                 + in(i+4,j+2) * (0.001488095238095238) &
-                 + in(i+5,j+2) * (0.000925925925925926) &
-                 + in(i+6,j+2) * (0.0006313131313131314) &
-                 + in(i-6,j+3) * (-0.0006313131313131314) &
-                 + in(i-5,j+3) * (-0.000925925925925926) &
-                 + in(i-4,j+3) * (-0.001488095238095238) &
-                 + in(i-3,j+3) * (-0.002777777777777778) &
-                 + in(i-2,j+3) * (-0.006944444444444444) &
-                 + in(i-1,j+3) * (-0.041666666666666664) &
-                 + in(i+1,j+3) * (0.002777777777777778) &
-                 + in(i+2,j+3) * (0.002777777777777778) &
-                 + in(i+3,j+3) * (0.013888888888888888) &
-                 + in(i+4,j+3) * (0.001488095238095238) &
-                 + in(i+5,j+3) * (0.000925925925925926) &
-                 + in(i+6,j+3) * (0.0006313131313131314) &
-                 + in(i-6,j+4) * (-0.0006313131313131314) &
-                 + in(i-5,j+4) * (-0.000925925925925926) &
-                 + in(i-4,j+4) * (-0.001488095238095238) &
-                 + in(i-3,j+4) * (-0.002777777777777778) &
-                 + in(i-2,j+4) * (-0.006944444444444444) &
-                 + in(i-1,j+4) * (-0.041666666666666664) &
-                 + in(i+1,j+4) * (0.001488095238095238) &
-                 + in(i+2,j+4) * (0.001488095238095238) &
-                 + in(i+3,j+4) * (0.001488095238095238) &
-                 + in(i+4,j+4) * (0.010416666666666666) &
-                 + in(i+5,j+4) * (0.000925925925925926) &
-                 + in(i+6,j+4) * (0.0006313131313131314) &
-                 + in(i-6,j+5) * (-0.0006313131313131314) &
-                 + in(i-5,j+5) * (-0.000925925925925926) &
-                 + in(i-4,j+5) * (-0.001488095238095238) &
-                 + in(i-3,j+5) * (-0.002777777777777778) &
-                 + in(i-2,j+5) * (-0.006944444444444444) &
-                 + in(i-1,j+5) * (-0.041666666666666664) &
-                 + in(i+1,j+5) * (0.000925925925925926) &
-                 + in(i+2,j+5) * (0.000925925925925926) &
-                 + in(i+3,j+5) * (0.000925925925925926) &
-                 + in(i+4,j+5) * (0.000925925925925926) &
-                 + in(i+5,j+5) * (0.008333333333333333) &
-                 + in(i+6,j+5) * (0.0006313131313131314) &
-                 + in(i-6,j+6) * (-0.0006313131313131314) &
-                 + in(i-5,j+6) * (-0.000925925925925926) &
-                 + in(i-4,j+6) * (-0.001488095238095238) &
-                 + in(i-3,j+6) * (-0.002777777777777778) &
-                 + in(i-2,j+6) * (-0.006944444444444444) &
-                 + in(i-1,j+6) * (-0.041666666666666664) &
-                 + in(i+1,j+6) * (0.0006313131313131314) &
-                 + in(i+2,j+6) * (0.0006313131313131314) &
-                 + in(i+3,j+6) * (0.0006313131313131314) &
-                 + in(i+4,j+6) * (0.0006313131313131314) &
-                 + in(i+5,j+6) * (0.0006313131313131314) &
-                 + in(i+6,j+6) * (0.006944444444444444) &
+                 + in(i-6,j-6) * (-0.006944444444444444d0) &
+                 + in(i+1,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+2,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+3,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+4,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+5,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+6,j-6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j-5) * (-0.008333333333333333d0) &
+                 + in(i+1,j-5) * (-0.000925925925925926d0) &
+                 + in(i+2,j-5) * (-0.000925925925925926d0) &
+                 + in(i+3,j-5) * (-0.000925925925925926d0) &
+                 + in(i+4,j-5) * (-0.000925925925925926d0) &
+                 + in(i+5,j-5) * (-0.000925925925925926d0) &
+                 + in(i+6,j-5) * (-0.000925925925925926d0) &
+                 + in(i-4,j-4) * (-0.010416666666666666d0) &
+                 + in(i+1,j-4) * (-0.001488095238095238d0) &
+                 + in(i+2,j-4) * (-0.001488095238095238d0) &
+                 + in(i+3,j-4) * (-0.001488095238095238d0) &
+                 + in(i+4,j-4) * (-0.001488095238095238d0) &
+                 + in(i+5,j-4) * (-0.001488095238095238d0) &
+                 + in(i+6,j-4) * (-0.001488095238095238d0) &
+                 + in(i-3,j-3) * (-0.013888888888888888d0) &
+                 + in(i+1,j-3) * (-0.002777777777777778d0) &
+                 + in(i+2,j-3) * (-0.002777777777777778d0) &
+                 + in(i+3,j-3) * (-0.002777777777777778d0) &
+                 + in(i+4,j-3) * (-0.002777777777777778d0) &
+                 + in(i+5,j-3) * (-0.002777777777777778d0) &
+                 + in(i+6,j-3) * (-0.002777777777777778d0) &
+                 + in(i-2,j-2) * (-0.020833333333333332d0) &
+                 + in(i+1,j-2) * (-0.006944444444444444d0) &
+                 + in(i+2,j-2) * (-0.006944444444444444d0) &
+                 + in(i+3,j-2) * (-0.006944444444444444d0) &
+                 + in(i+4,j-2) * (-0.006944444444444444d0) &
+                 + in(i+5,j-2) * (-0.006944444444444444d0) &
+                 + in(i+6,j-2) * (-0.006944444444444444d0) &
+                 + in(i-1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+2,j-1) * (-0.041666666666666664d0) &
+                 + in(i+3,j-1) * (-0.041666666666666664d0) &
+                 + in(i+4,j-1) * (-0.041666666666666664d0) &
+                 + in(i+5,j-1) * (-0.041666666666666664d0) &
+                 + in(i+6,j-1) * (-0.041666666666666664d0) &
+                 + in(i-6,j+1) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+1) * (-0.000925925925925926d0) &
+                 + in(i-4,j+1) * (-0.001488095238095238d0) &
+                 + in(i-3,j+1) * (-0.002777777777777778d0) &
+                 + in(i-2,j+1) * (-0.006944444444444444d0) &
+                 + in(i-1,j+1) * (-0.041666666666666664d0) &
+                 + in(i+1,j+1) * (0.041666666666666664d0) &
+                 + in(i+2,j+1) * (0.006944444444444444d0) &
+                 + in(i+3,j+1) * (0.002777777777777778d0) &
+                 + in(i+4,j+1) * (0.001488095238095238d0) &
+                 + in(i+5,j+1) * (0.000925925925925926d0) &
+                 + in(i+6,j+1) * (0.0006313131313131314d0) &
+                 + in(i-6,j+2) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+2) * (-0.000925925925925926d0) &
+                 + in(i-4,j+2) * (-0.001488095238095238d0) &
+                 + in(i-3,j+2) * (-0.002777777777777778d0) &
+                 + in(i-2,j+2) * (-0.006944444444444444d0) &
+                 + in(i-1,j+2) * (-0.041666666666666664d0) &
+                 + in(i+1,j+2) * (0.006944444444444444d0) &
+                 + in(i+2,j+2) * (0.020833333333333332d0) &
+                 + in(i+3,j+2) * (0.002777777777777778d0) &
+                 + in(i+4,j+2) * (0.001488095238095238d0) &
+                 + in(i+5,j+2) * (0.000925925925925926d0) &
+                 + in(i+6,j+2) * (0.0006313131313131314d0) &
+                 + in(i-6,j+3) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+3) * (-0.000925925925925926d0) &
+                 + in(i-4,j+3) * (-0.001488095238095238d0) &
+                 + in(i-3,j+3) * (-0.002777777777777778d0) &
+                 + in(i-2,j+3) * (-0.006944444444444444d0) &
+                 + in(i-1,j+3) * (-0.041666666666666664d0) &
+                 + in(i+1,j+3) * (0.002777777777777778d0) &
+                 + in(i+2,j+3) * (0.002777777777777778d0) &
+                 + in(i+3,j+3) * (0.013888888888888888d0) &
+                 + in(i+4,j+3) * (0.001488095238095238d0) &
+                 + in(i+5,j+3) * (0.000925925925925926d0) &
+                 + in(i+6,j+3) * (0.0006313131313131314d0) &
+                 + in(i-6,j+4) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+4) * (-0.000925925925925926d0) &
+                 + in(i-4,j+4) * (-0.001488095238095238d0) &
+                 + in(i-3,j+4) * (-0.002777777777777778d0) &
+                 + in(i-2,j+4) * (-0.006944444444444444d0) &
+                 + in(i-1,j+4) * (-0.041666666666666664d0) &
+                 + in(i+1,j+4) * (0.001488095238095238d0) &
+                 + in(i+2,j+4) * (0.001488095238095238d0) &
+                 + in(i+3,j+4) * (0.001488095238095238d0) &
+                 + in(i+4,j+4) * (0.010416666666666666d0) &
+                 + in(i+5,j+4) * (0.000925925925925926d0) &
+                 + in(i+6,j+4) * (0.0006313131313131314d0) &
+                 + in(i-6,j+5) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+5) * (-0.000925925925925926d0) &
+                 + in(i-4,j+5) * (-0.001488095238095238d0) &
+                 + in(i-3,j+5) * (-0.002777777777777778d0) &
+                 + in(i-2,j+5) * (-0.006944444444444444d0) &
+                 + in(i-1,j+5) * (-0.041666666666666664d0) &
+                 + in(i+1,j+5) * (0.000925925925925926d0) &
+                 + in(i+2,j+5) * (0.000925925925925926d0) &
+                 + in(i+3,j+5) * (0.000925925925925926d0) &
+                 + in(i+4,j+5) * (0.000925925925925926d0) &
+                 + in(i+5,j+5) * (0.008333333333333333d0) &
+                 + in(i+6,j+5) * (0.0006313131313131314d0) &
+                 + in(i-6,j+6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+6) * (-0.000925925925925926d0) &
+                 + in(i-4,j+6) * (-0.001488095238095238d0) &
+                 + in(i-3,j+6) * (-0.002777777777777778d0) &
+                 + in(i-2,j+6) * (-0.006944444444444444d0) &
+                 + in(i-1,j+6) * (-0.041666666666666664d0) &
+                 + in(i+1,j+6) * (0.0006313131313131314d0) &
+                 + in(i+2,j+6) * (0.0006313131313131314d0) &
+                 + in(i+3,j+6) * (0.0006313131313131314d0) &
+                 + in(i+4,j+6) * (0.0006313131313131314d0) &
+                 + in(i+5,j+6) * (0.0006313131313131314d0) &
+                 + in(i+6,j+6) * (0.006944444444444444d0) &
 +0.0
       end do
       !$omp end simd
@@ -767,162 +797,164 @@ subroutine grid7(n, in, out)
     !$omp do
     do i=7,n-7-1
       !$omp simd
+      do j=7,n-7-1
+    do i=7,n-7-1
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i-7,j-7) * (-0.00510204081632653) &
-                 + in(i+1,j-7) * (-0.0003924646781789639) &
-                 + in(i+2,j-7) * (-0.0003924646781789639) &
-                 + in(i+3,j-7) * (-0.0003924646781789639) &
-                 + in(i+4,j-7) * (-0.0003924646781789639) &
-                 + in(i+5,j-7) * (-0.0003924646781789639) &
-                 + in(i+6,j-7) * (-0.0003924646781789639) &
-                 + in(i+7,j-7) * (-0.0003924646781789639) &
-                 + in(i-6,j-6) * (-0.005952380952380952) &
-                 + in(i+1,j-6) * (-0.0005411255411255411) &
-                 + in(i+2,j-6) * (-0.0005411255411255411) &
-                 + in(i+3,j-6) * (-0.0005411255411255411) &
-                 + in(i+4,j-6) * (-0.0005411255411255411) &
-                 + in(i+5,j-6) * (-0.0005411255411255411) &
-                 + in(i+6,j-6) * (-0.0005411255411255411) &
-                 + in(i+7,j-6) * (-0.0005411255411255411) &
-                 + in(i-5,j-5) * (-0.007142857142857143) &
-                 + in(i+1,j-5) * (-0.0007936507936507937) &
-                 + in(i+2,j-5) * (-0.0007936507936507937) &
-                 + in(i+3,j-5) * (-0.0007936507936507937) &
-                 + in(i+4,j-5) * (-0.0007936507936507937) &
-                 + in(i+5,j-5) * (-0.0007936507936507937) &
-                 + in(i+6,j-5) * (-0.0007936507936507937) &
-                 + in(i+7,j-5) * (-0.0007936507936507937) &
-                 + in(i-4,j-4) * (-0.008928571428571428) &
-                 + in(i+1,j-4) * (-0.0012755102040816326) &
-                 + in(i+2,j-4) * (-0.0012755102040816326) &
-                 + in(i+3,j-4) * (-0.0012755102040816326) &
-                 + in(i+4,j-4) * (-0.0012755102040816326) &
-                 + in(i+5,j-4) * (-0.0012755102040816326) &
-                 + in(i+6,j-4) * (-0.0012755102040816326) &
-                 + in(i+7,j-4) * (-0.0012755102040816326) &
-                 + in(i-3,j-3) * (-0.011904761904761904) &
-                 + in(i+1,j-3) * (-0.002380952380952381) &
-                 + in(i+2,j-3) * (-0.002380952380952381) &
-                 + in(i+3,j-3) * (-0.002380952380952381) &
-                 + in(i+4,j-3) * (-0.002380952380952381) &
-                 + in(i+5,j-3) * (-0.002380952380952381) &
-                 + in(i+6,j-3) * (-0.002380952380952381) &
-                 + in(i+7,j-3) * (-0.002380952380952381) &
-                 + in(i-2,j-2) * (-0.017857142857142856) &
-                 + in(i+1,j-2) * (-0.005952380952380952) &
-                 + in(i+2,j-2) * (-0.005952380952380952) &
-                 + in(i+3,j-2) * (-0.005952380952380952) &
-                 + in(i+4,j-2) * (-0.005952380952380952) &
-                 + in(i+5,j-2) * (-0.005952380952380952) &
-                 + in(i+6,j-2) * (-0.005952380952380952) &
-                 + in(i+7,j-2) * (-0.005952380952380952) &
-                 + in(i-1,j-1) * (-0.03571428571428571) &
-                 + in(i+1,j-1) * (-0.03571428571428571) &
-                 + in(i+2,j-1) * (-0.03571428571428571) &
-                 + in(i+3,j-1) * (-0.03571428571428571) &
-                 + in(i+4,j-1) * (-0.03571428571428571) &
-                 + in(i+5,j-1) * (-0.03571428571428571) &
-                 + in(i+6,j-1) * (-0.03571428571428571) &
-                 + in(i+7,j-1) * (-0.03571428571428571) &
-                 + in(i-7,j+1) * (-0.0003924646781789639) &
-                 + in(i-6,j+1) * (-0.0005411255411255411) &
-                 + in(i-5,j+1) * (-0.0007936507936507937) &
-                 + in(i-4,j+1) * (-0.0012755102040816326) &
-                 + in(i-3,j+1) * (-0.002380952380952381) &
-                 + in(i-2,j+1) * (-0.005952380952380952) &
-                 + in(i-1,j+1) * (-0.03571428571428571) &
-                 + in(i+1,j+1) * (0.03571428571428571) &
-                 + in(i+2,j+1) * (0.005952380952380952) &
-                 + in(i+3,j+1) * (0.002380952380952381) &
-                 + in(i+4,j+1) * (0.0012755102040816326) &
-                 + in(i+5,j+1) * (0.0007936507936507937) &
-                 + in(i+6,j+1) * (0.0005411255411255411) &
-                 + in(i+7,j+1) * (0.0003924646781789639) &
-                 + in(i-7,j+2) * (-0.0003924646781789639) &
-                 + in(i-6,j+2) * (-0.0005411255411255411) &
-                 + in(i-5,j+2) * (-0.0007936507936507937) &
-                 + in(i-4,j+2) * (-0.0012755102040816326) &
-                 + in(i-3,j+2) * (-0.002380952380952381) &
-                 + in(i-2,j+2) * (-0.005952380952380952) &
-                 + in(i-1,j+2) * (-0.03571428571428571) &
-                 + in(i+1,j+2) * (0.005952380952380952) &
-                 + in(i+2,j+2) * (0.017857142857142856) &
-                 + in(i+3,j+2) * (0.002380952380952381) &
-                 + in(i+4,j+2) * (0.0012755102040816326) &
-                 + in(i+5,j+2) * (0.0007936507936507937) &
-                 + in(i+6,j+2) * (0.0005411255411255411) &
-                 + in(i+7,j+2) * (0.0003924646781789639) &
-                 + in(i-7,j+3) * (-0.0003924646781789639) &
-                 + in(i-6,j+3) * (-0.0005411255411255411) &
-                 + in(i-5,j+3) * (-0.0007936507936507937) &
-                 + in(i-4,j+3) * (-0.0012755102040816326) &
-                 + in(i-3,j+3) * (-0.002380952380952381) &
-                 + in(i-2,j+3) * (-0.005952380952380952) &
-                 + in(i-1,j+3) * (-0.03571428571428571) &
-                 + in(i+1,j+3) * (0.002380952380952381) &
-                 + in(i+2,j+3) * (0.002380952380952381) &
-                 + in(i+3,j+3) * (0.011904761904761904) &
-                 + in(i+4,j+3) * (0.0012755102040816326) &
-                 + in(i+5,j+3) * (0.0007936507936507937) &
-                 + in(i+6,j+3) * (0.0005411255411255411) &
-                 + in(i+7,j+3) * (0.0003924646781789639) &
-                 + in(i-7,j+4) * (-0.0003924646781789639) &
-                 + in(i-6,j+4) * (-0.0005411255411255411) &
-                 + in(i-5,j+4) * (-0.0007936507936507937) &
-                 + in(i-4,j+4) * (-0.0012755102040816326) &
-                 + in(i-3,j+4) * (-0.002380952380952381) &
-                 + in(i-2,j+4) * (-0.005952380952380952) &
-                 + in(i-1,j+4) * (-0.03571428571428571) &
-                 + in(i+1,j+4) * (0.0012755102040816326) &
-                 + in(i+2,j+4) * (0.0012755102040816326) &
-                 + in(i+3,j+4) * (0.0012755102040816326) &
-                 + in(i+4,j+4) * (0.008928571428571428) &
-                 + in(i+5,j+4) * (0.0007936507936507937) &
-                 + in(i+6,j+4) * (0.0005411255411255411) &
-                 + in(i+7,j+4) * (0.0003924646781789639) &
-                 + in(i-7,j+5) * (-0.0003924646781789639) &
-                 + in(i-6,j+5) * (-0.0005411255411255411) &
-                 + in(i-5,j+5) * (-0.0007936507936507937) &
-                 + in(i-4,j+5) * (-0.0012755102040816326) &
-                 + in(i-3,j+5) * (-0.002380952380952381) &
-                 + in(i-2,j+5) * (-0.005952380952380952) &
-                 + in(i-1,j+5) * (-0.03571428571428571) &
-                 + in(i+1,j+5) * (0.0007936507936507937) &
-                 + in(i+2,j+5) * (0.0007936507936507937) &
-                 + in(i+3,j+5) * (0.0007936507936507937) &
-                 + in(i+4,j+5) * (0.0007936507936507937) &
-                 + in(i+5,j+5) * (0.007142857142857143) &
-                 + in(i+6,j+5) * (0.0005411255411255411) &
-                 + in(i+7,j+5) * (0.0003924646781789639) &
-                 + in(i-7,j+6) * (-0.0003924646781789639) &
-                 + in(i-6,j+6) * (-0.0005411255411255411) &
-                 + in(i-5,j+6) * (-0.0007936507936507937) &
-                 + in(i-4,j+6) * (-0.0012755102040816326) &
-                 + in(i-3,j+6) * (-0.002380952380952381) &
-                 + in(i-2,j+6) * (-0.005952380952380952) &
-                 + in(i-1,j+6) * (-0.03571428571428571) &
-                 + in(i+1,j+6) * (0.0005411255411255411) &
-                 + in(i+2,j+6) * (0.0005411255411255411) &
-                 + in(i+3,j+6) * (0.0005411255411255411) &
-                 + in(i+4,j+6) * (0.0005411255411255411) &
-                 + in(i+5,j+6) * (0.0005411255411255411) &
-                 + in(i+6,j+6) * (0.005952380952380952) &
-                 + in(i+7,j+6) * (0.0003924646781789639) &
-                 + in(i-7,j+7) * (-0.0003924646781789639) &
-                 + in(i-6,j+7) * (-0.0005411255411255411) &
-                 + in(i-5,j+7) * (-0.0007936507936507937) &
-                 + in(i-4,j+7) * (-0.0012755102040816326) &
-                 + in(i-3,j+7) * (-0.002380952380952381) &
-                 + in(i-2,j+7) * (-0.005952380952380952) &
-                 + in(i-1,j+7) * (-0.03571428571428571) &
-                 + in(i+1,j+7) * (0.0003924646781789639) &
-                 + in(i+2,j+7) * (0.0003924646781789639) &
-                 + in(i+3,j+7) * (0.0003924646781789639) &
-                 + in(i+4,j+7) * (0.0003924646781789639) &
-                 + in(i+5,j+7) * (0.0003924646781789639) &
-                 + in(i+6,j+7) * (0.0003924646781789639) &
-                 + in(i+7,j+7) * (0.00510204081632653) &
+                 + in(i-7,j-7) * (-0.00510204081632653d0) &
+                 + in(i+1,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+2,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+3,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+4,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+5,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+6,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+7,j-7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j-6) * (-0.005952380952380952d0) &
+                 + in(i+1,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+2,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+3,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+4,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+5,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+6,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+7,j-6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j-5) * (-0.007142857142857143d0) &
+                 + in(i+1,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+2,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+3,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+4,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+5,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+6,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+7,j-5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j-4) * (-0.008928571428571428d0) &
+                 + in(i+1,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+2,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+3,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+4,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+5,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+6,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+7,j-4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j-3) * (-0.011904761904761904d0) &
+                 + in(i+1,j-3) * (-0.002380952380952381d0) &
+                 + in(i+2,j-3) * (-0.002380952380952381d0) &
+                 + in(i+3,j-3) * (-0.002380952380952381d0) &
+                 + in(i+4,j-3) * (-0.002380952380952381d0) &
+                 + in(i+5,j-3) * (-0.002380952380952381d0) &
+                 + in(i+6,j-3) * (-0.002380952380952381d0) &
+                 + in(i+7,j-3) * (-0.002380952380952381d0) &
+                 + in(i-2,j-2) * (-0.017857142857142856d0) &
+                 + in(i+1,j-2) * (-0.005952380952380952d0) &
+                 + in(i+2,j-2) * (-0.005952380952380952d0) &
+                 + in(i+3,j-2) * (-0.005952380952380952d0) &
+                 + in(i+4,j-2) * (-0.005952380952380952d0) &
+                 + in(i+5,j-2) * (-0.005952380952380952d0) &
+                 + in(i+6,j-2) * (-0.005952380952380952d0) &
+                 + in(i+7,j-2) * (-0.005952380952380952d0) &
+                 + in(i-1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+2,j-1) * (-0.03571428571428571d0) &
+                 + in(i+3,j-1) * (-0.03571428571428571d0) &
+                 + in(i+4,j-1) * (-0.03571428571428571d0) &
+                 + in(i+5,j-1) * (-0.03571428571428571d0) &
+                 + in(i+6,j-1) * (-0.03571428571428571d0) &
+                 + in(i+7,j-1) * (-0.03571428571428571d0) &
+                 + in(i-7,j+1) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+1) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+1) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+1) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+1) * (-0.002380952380952381d0) &
+                 + in(i-2,j+1) * (-0.005952380952380952d0) &
+                 + in(i-1,j+1) * (-0.03571428571428571d0) &
+                 + in(i+1,j+1) * (0.03571428571428571d0) &
+                 + in(i+2,j+1) * (0.005952380952380952d0) &
+                 + in(i+3,j+1) * (0.002380952380952381d0) &
+                 + in(i+4,j+1) * (0.0012755102040816326d0) &
+                 + in(i+5,j+1) * (0.0007936507936507937d0) &
+                 + in(i+6,j+1) * (0.0005411255411255411d0) &
+                 + in(i+7,j+1) * (0.0003924646781789639d0) &
+                 + in(i-7,j+2) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+2) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+2) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+2) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+2) * (-0.002380952380952381d0) &
+                 + in(i-2,j+2) * (-0.005952380952380952d0) &
+                 + in(i-1,j+2) * (-0.03571428571428571d0) &
+                 + in(i+1,j+2) * (0.005952380952380952d0) &
+                 + in(i+2,j+2) * (0.017857142857142856d0) &
+                 + in(i+3,j+2) * (0.002380952380952381d0) &
+                 + in(i+4,j+2) * (0.0012755102040816326d0) &
+                 + in(i+5,j+2) * (0.0007936507936507937d0) &
+                 + in(i+6,j+2) * (0.0005411255411255411d0) &
+                 + in(i+7,j+2) * (0.0003924646781789639d0) &
+                 + in(i-7,j+3) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+3) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+3) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+3) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+3) * (-0.002380952380952381d0) &
+                 + in(i-2,j+3) * (-0.005952380952380952d0) &
+                 + in(i-1,j+3) * (-0.03571428571428571d0) &
+                 + in(i+1,j+3) * (0.002380952380952381d0) &
+                 + in(i+2,j+3) * (0.002380952380952381d0) &
+                 + in(i+3,j+3) * (0.011904761904761904d0) &
+                 + in(i+4,j+3) * (0.0012755102040816326d0) &
+                 + in(i+5,j+3) * (0.0007936507936507937d0) &
+                 + in(i+6,j+3) * (0.0005411255411255411d0) &
+                 + in(i+7,j+3) * (0.0003924646781789639d0) &
+                 + in(i-7,j+4) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+4) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+4) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+4) * (-0.002380952380952381d0) &
+                 + in(i-2,j+4) * (-0.005952380952380952d0) &
+                 + in(i-1,j+4) * (-0.03571428571428571d0) &
+                 + in(i+1,j+4) * (0.0012755102040816326d0) &
+                 + in(i+2,j+4) * (0.0012755102040816326d0) &
+                 + in(i+3,j+4) * (0.0012755102040816326d0) &
+                 + in(i+4,j+4) * (0.008928571428571428d0) &
+                 + in(i+5,j+4) * (0.0007936507936507937d0) &
+                 + in(i+6,j+4) * (0.0005411255411255411d0) &
+                 + in(i+7,j+4) * (0.0003924646781789639d0) &
+                 + in(i-7,j+5) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+5) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+5) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+5) * (-0.002380952380952381d0) &
+                 + in(i-2,j+5) * (-0.005952380952380952d0) &
+                 + in(i-1,j+5) * (-0.03571428571428571d0) &
+                 + in(i+1,j+5) * (0.0007936507936507937d0) &
+                 + in(i+2,j+5) * (0.0007936507936507937d0) &
+                 + in(i+3,j+5) * (0.0007936507936507937d0) &
+                 + in(i+4,j+5) * (0.0007936507936507937d0) &
+                 + in(i+5,j+5) * (0.007142857142857143d0) &
+                 + in(i+6,j+5) * (0.0005411255411255411d0) &
+                 + in(i+7,j+5) * (0.0003924646781789639d0) &
+                 + in(i-7,j+6) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+6) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+6) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+6) * (-0.002380952380952381d0) &
+                 + in(i-2,j+6) * (-0.005952380952380952d0) &
+                 + in(i-1,j+6) * (-0.03571428571428571d0) &
+                 + in(i+1,j+6) * (0.0005411255411255411d0) &
+                 + in(i+2,j+6) * (0.0005411255411255411d0) &
+                 + in(i+3,j+6) * (0.0005411255411255411d0) &
+                 + in(i+4,j+6) * (0.0005411255411255411d0) &
+                 + in(i+5,j+6) * (0.0005411255411255411d0) &
+                 + in(i+6,j+6) * (0.005952380952380952d0) &
+                 + in(i+7,j+6) * (0.0003924646781789639d0) &
+                 + in(i-7,j+7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+7) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+7) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+7) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+7) * (-0.002380952380952381d0) &
+                 + in(i-2,j+7) * (-0.005952380952380952d0) &
+                 + in(i-1,j+7) * (-0.03571428571428571d0) &
+                 + in(i+1,j+7) * (0.0003924646781789639d0) &
+                 + in(i+2,j+7) * (0.0003924646781789639d0) &
+                 + in(i+3,j+7) * (0.0003924646781789639d0) &
+                 + in(i+4,j+7) * (0.0003924646781789639d0) &
+                 + in(i+5,j+7) * (0.0003924646781789639d0) &
+                 + in(i+6,j+7) * (0.0003924646781789639d0) &
+                 + in(i+7,j+7) * (0.00510204081632653d0) &
 +0.0
       end do
       !$omp end simd
@@ -940,208 +972,210 @@ subroutine grid8(n, in, out)
     !$omp do
     do i=8,n-8-1
       !$omp simd
+      do j=8,n-8-1
+    do i=8,n-8-1
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i-8,j-8) * (-0.00390625) &
-                 + in(i+1,j-8) * (-0.00026041666666666666) &
-                 + in(i+2,j-8) * (-0.00026041666666666666) &
-                 + in(i+3,j-8) * (-0.00026041666666666666) &
-                 + in(i+4,j-8) * (-0.00026041666666666666) &
-                 + in(i+5,j-8) * (-0.00026041666666666666) &
-                 + in(i+6,j-8) * (-0.00026041666666666666) &
-                 + in(i+7,j-8) * (-0.00026041666666666666) &
-                 + in(i+8,j-8) * (-0.00026041666666666666) &
-                 + in(i-7,j-7) * (-0.004464285714285714) &
-                 + in(i+1,j-7) * (-0.00034340659340659343) &
-                 + in(i+2,j-7) * (-0.00034340659340659343) &
-                 + in(i+3,j-7) * (-0.00034340659340659343) &
-                 + in(i+4,j-7) * (-0.00034340659340659343) &
-                 + in(i+5,j-7) * (-0.00034340659340659343) &
-                 + in(i+6,j-7) * (-0.00034340659340659343) &
-                 + in(i+7,j-7) * (-0.00034340659340659343) &
-                 + in(i+8,j-7) * (-0.00034340659340659343) &
-                 + in(i-6,j-6) * (-0.005208333333333333) &
-                 + in(i+1,j-6) * (-0.0004734848484848485) &
-                 + in(i+2,j-6) * (-0.0004734848484848485) &
-                 + in(i+3,j-6) * (-0.0004734848484848485) &
-                 + in(i+4,j-6) * (-0.0004734848484848485) &
-                 + in(i+5,j-6) * (-0.0004734848484848485) &
-                 + in(i+6,j-6) * (-0.0004734848484848485) &
-                 + in(i+7,j-6) * (-0.0004734848484848485) &
-                 + in(i+8,j-6) * (-0.0004734848484848485) &
-                 + in(i-5,j-5) * (-0.00625) &
-                 + in(i+1,j-5) * (-0.0006944444444444445) &
-                 + in(i+2,j-5) * (-0.0006944444444444445) &
-                 + in(i+3,j-5) * (-0.0006944444444444445) &
-                 + in(i+4,j-5) * (-0.0006944444444444445) &
-                 + in(i+5,j-5) * (-0.0006944444444444445) &
-                 + in(i+6,j-5) * (-0.0006944444444444445) &
-                 + in(i+7,j-5) * (-0.0006944444444444445) &
-                 + in(i+8,j-5) * (-0.0006944444444444445) &
-                 + in(i-4,j-4) * (-0.0078125) &
-                 + in(i+1,j-4) * (-0.0011160714285714285) &
-                 + in(i+2,j-4) * (-0.0011160714285714285) &
-                 + in(i+3,j-4) * (-0.0011160714285714285) &
-                 + in(i+4,j-4) * (-0.0011160714285714285) &
-                 + in(i+5,j-4) * (-0.0011160714285714285) &
-                 + in(i+6,j-4) * (-0.0011160714285714285) &
-                 + in(i+7,j-4) * (-0.0011160714285714285) &
-                 + in(i+8,j-4) * (-0.0011160714285714285) &
-                 + in(i-3,j-3) * (-0.010416666666666666) &
-                 + in(i+1,j-3) * (-0.0020833333333333333) &
-                 + in(i+2,j-3) * (-0.0020833333333333333) &
-                 + in(i+3,j-3) * (-0.0020833333333333333) &
-                 + in(i+4,j-3) * (-0.0020833333333333333) &
-                 + in(i+5,j-3) * (-0.0020833333333333333) &
-                 + in(i+6,j-3) * (-0.0020833333333333333) &
-                 + in(i+7,j-3) * (-0.0020833333333333333) &
-                 + in(i+8,j-3) * (-0.0020833333333333333) &
-                 + in(i-2,j-2) * (-0.015625) &
-                 + in(i+1,j-2) * (-0.005208333333333333) &
-                 + in(i+2,j-2) * (-0.005208333333333333) &
-                 + in(i+3,j-2) * (-0.005208333333333333) &
-                 + in(i+4,j-2) * (-0.005208333333333333) &
-                 + in(i+5,j-2) * (-0.005208333333333333) &
-                 + in(i+6,j-2) * (-0.005208333333333333) &
-                 + in(i+7,j-2) * (-0.005208333333333333) &
-                 + in(i+8,j-2) * (-0.005208333333333333) &
-                 + in(i-1,j-1) * (-0.03125) &
-                 + in(i+1,j-1) * (-0.03125) &
-                 + in(i+2,j-1) * (-0.03125) &
-                 + in(i+3,j-1) * (-0.03125) &
-                 + in(i+4,j-1) * (-0.03125) &
-                 + in(i+5,j-1) * (-0.03125) &
-                 + in(i+6,j-1) * (-0.03125) &
-                 + in(i+7,j-1) * (-0.03125) &
-                 + in(i+8,j-1) * (-0.03125) &
-                 + in(i-8,j+1) * (-0.00026041666666666666) &
-                 + in(i-7,j+1) * (-0.00034340659340659343) &
-                 + in(i-6,j+1) * (-0.0004734848484848485) &
-                 + in(i-5,j+1) * (-0.0006944444444444445) &
-                 + in(i-4,j+1) * (-0.0011160714285714285) &
-                 + in(i-3,j+1) * (-0.0020833333333333333) &
-                 + in(i-2,j+1) * (-0.005208333333333333) &
-                 + in(i-1,j+1) * (-0.03125) &
-                 + in(i+1,j+1) * (0.03125) &
-                 + in(i+2,j+1) * (0.005208333333333333) &
-                 + in(i+3,j+1) * (0.0020833333333333333) &
-                 + in(i+4,j+1) * (0.0011160714285714285) &
-                 + in(i+5,j+1) * (0.0006944444444444445) &
-                 + in(i+6,j+1) * (0.0004734848484848485) &
-                 + in(i+7,j+1) * (0.00034340659340659343) &
-                 + in(i+8,j+1) * (0.00026041666666666666) &
-                 + in(i-8,j+2) * (-0.00026041666666666666) &
-                 + in(i-7,j+2) * (-0.00034340659340659343) &
-                 + in(i-6,j+2) * (-0.0004734848484848485) &
-                 + in(i-5,j+2) * (-0.0006944444444444445) &
-                 + in(i-4,j+2) * (-0.0011160714285714285) &
-                 + in(i-3,j+2) * (-0.0020833333333333333) &
-                 + in(i-2,j+2) * (-0.005208333333333333) &
-                 + in(i-1,j+2) * (-0.03125) &
-                 + in(i+1,j+2) * (0.005208333333333333) &
-                 + in(i+2,j+2) * (0.015625) &
-                 + in(i+3,j+2) * (0.0020833333333333333) &
-                 + in(i+4,j+2) * (0.0011160714285714285) &
-                 + in(i+5,j+2) * (0.0006944444444444445) &
-                 + in(i+6,j+2) * (0.0004734848484848485) &
-                 + in(i+7,j+2) * (0.00034340659340659343) &
-                 + in(i+8,j+2) * (0.00026041666666666666) &
-                 + in(i-8,j+3) * (-0.00026041666666666666) &
-                 + in(i-7,j+3) * (-0.00034340659340659343) &
-                 + in(i-6,j+3) * (-0.0004734848484848485) &
-                 + in(i-5,j+3) * (-0.0006944444444444445) &
-                 + in(i-4,j+3) * (-0.0011160714285714285) &
-                 + in(i-3,j+3) * (-0.0020833333333333333) &
-                 + in(i-2,j+3) * (-0.005208333333333333) &
-                 + in(i-1,j+3) * (-0.03125) &
-                 + in(i+1,j+3) * (0.0020833333333333333) &
-                 + in(i+2,j+3) * (0.0020833333333333333) &
-                 + in(i+3,j+3) * (0.010416666666666666) &
-                 + in(i+4,j+3) * (0.0011160714285714285) &
-                 + in(i+5,j+3) * (0.0006944444444444445) &
-                 + in(i+6,j+3) * (0.0004734848484848485) &
-                 + in(i+7,j+3) * (0.00034340659340659343) &
-                 + in(i+8,j+3) * (0.00026041666666666666) &
-                 + in(i-8,j+4) * (-0.00026041666666666666) &
-                 + in(i-7,j+4) * (-0.00034340659340659343) &
-                 + in(i-6,j+4) * (-0.0004734848484848485) &
-                 + in(i-5,j+4) * (-0.0006944444444444445) &
-                 + in(i-4,j+4) * (-0.0011160714285714285) &
-                 + in(i-3,j+4) * (-0.0020833333333333333) &
-                 + in(i-2,j+4) * (-0.005208333333333333) &
-                 + in(i-1,j+4) * (-0.03125) &
-                 + in(i+1,j+4) * (0.0011160714285714285) &
-                 + in(i+2,j+4) * (0.0011160714285714285) &
-                 + in(i+3,j+4) * (0.0011160714285714285) &
-                 + in(i+4,j+4) * (0.0078125) &
-                 + in(i+5,j+4) * (0.0006944444444444445) &
-                 + in(i+6,j+4) * (0.0004734848484848485) &
-                 + in(i+7,j+4) * (0.00034340659340659343) &
-                 + in(i+8,j+4) * (0.00026041666666666666) &
-                 + in(i-8,j+5) * (-0.00026041666666666666) &
-                 + in(i-7,j+5) * (-0.00034340659340659343) &
-                 + in(i-6,j+5) * (-0.0004734848484848485) &
-                 + in(i-5,j+5) * (-0.0006944444444444445) &
-                 + in(i-4,j+5) * (-0.0011160714285714285) &
-                 + in(i-3,j+5) * (-0.0020833333333333333) &
-                 + in(i-2,j+5) * (-0.005208333333333333) &
-                 + in(i-1,j+5) * (-0.03125) &
-                 + in(i+1,j+5) * (0.0006944444444444445) &
-                 + in(i+2,j+5) * (0.0006944444444444445) &
-                 + in(i+3,j+5) * (0.0006944444444444445) &
-                 + in(i+4,j+5) * (0.0006944444444444445) &
-                 + in(i+5,j+5) * (0.00625) &
-                 + in(i+6,j+5) * (0.0004734848484848485) &
-                 + in(i+7,j+5) * (0.00034340659340659343) &
-                 + in(i+8,j+5) * (0.00026041666666666666) &
-                 + in(i-8,j+6) * (-0.00026041666666666666) &
-                 + in(i-7,j+6) * (-0.00034340659340659343) &
-                 + in(i-6,j+6) * (-0.0004734848484848485) &
-                 + in(i-5,j+6) * (-0.0006944444444444445) &
-                 + in(i-4,j+6) * (-0.0011160714285714285) &
-                 + in(i-3,j+6) * (-0.0020833333333333333) &
-                 + in(i-2,j+6) * (-0.005208333333333333) &
-                 + in(i-1,j+6) * (-0.03125) &
-                 + in(i+1,j+6) * (0.0004734848484848485) &
-                 + in(i+2,j+6) * (0.0004734848484848485) &
-                 + in(i+3,j+6) * (0.0004734848484848485) &
-                 + in(i+4,j+6) * (0.0004734848484848485) &
-                 + in(i+5,j+6) * (0.0004734848484848485) &
-                 + in(i+6,j+6) * (0.005208333333333333) &
-                 + in(i+7,j+6) * (0.00034340659340659343) &
-                 + in(i+8,j+6) * (0.00026041666666666666) &
-                 + in(i-8,j+7) * (-0.00026041666666666666) &
-                 + in(i-7,j+7) * (-0.00034340659340659343) &
-                 + in(i-6,j+7) * (-0.0004734848484848485) &
-                 + in(i-5,j+7) * (-0.0006944444444444445) &
-                 + in(i-4,j+7) * (-0.0011160714285714285) &
-                 + in(i-3,j+7) * (-0.0020833333333333333) &
-                 + in(i-2,j+7) * (-0.005208333333333333) &
-                 + in(i-1,j+7) * (-0.03125) &
-                 + in(i+1,j+7) * (0.00034340659340659343) &
-                 + in(i+2,j+7) * (0.00034340659340659343) &
-                 + in(i+3,j+7) * (0.00034340659340659343) &
-                 + in(i+4,j+7) * (0.00034340659340659343) &
-                 + in(i+5,j+7) * (0.00034340659340659343) &
-                 + in(i+6,j+7) * (0.00034340659340659343) &
-                 + in(i+7,j+7) * (0.004464285714285714) &
-                 + in(i+8,j+7) * (0.00026041666666666666) &
-                 + in(i-8,j+8) * (-0.00026041666666666666) &
-                 + in(i-7,j+8) * (-0.00034340659340659343) &
-                 + in(i-6,j+8) * (-0.0004734848484848485) &
-                 + in(i-5,j+8) * (-0.0006944444444444445) &
-                 + in(i-4,j+8) * (-0.0011160714285714285) &
-                 + in(i-3,j+8) * (-0.0020833333333333333) &
-                 + in(i-2,j+8) * (-0.005208333333333333) &
-                 + in(i-1,j+8) * (-0.03125) &
-                 + in(i+1,j+8) * (0.00026041666666666666) &
-                 + in(i+2,j+8) * (0.00026041666666666666) &
-                 + in(i+3,j+8) * (0.00026041666666666666) &
-                 + in(i+4,j+8) * (0.00026041666666666666) &
-                 + in(i+5,j+8) * (0.00026041666666666666) &
-                 + in(i+6,j+8) * (0.00026041666666666666) &
-                 + in(i+7,j+8) * (0.00026041666666666666) &
-                 + in(i+8,j+8) * (0.00390625) &
+                 + in(i-8,j-8) * (-0.00390625d0) &
+                 + in(i+1,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+2,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+3,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+4,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+5,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+6,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+7,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+8,j-8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j-7) * (-0.004464285714285714d0) &
+                 + in(i+1,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+2,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+3,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+4,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+5,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+6,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+7,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+8,j-7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j-6) * (-0.005208333333333333d0) &
+                 + in(i+1,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+2,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+3,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+4,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+5,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+6,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+7,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+8,j-6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j-5) * (-0.00625d0) &
+                 + in(i+1,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+2,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+3,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+4,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+5,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+6,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+7,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+8,j-5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j-4) * (-0.0078125d0) &
+                 + in(i+1,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+2,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+3,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+4,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+5,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+6,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+7,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+8,j-4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j-3) * (-0.010416666666666666d0) &
+                 + in(i+1,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+2,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+3,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+4,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+5,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+6,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+7,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+8,j-3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j-2) * (-0.015625d0) &
+                 + in(i+1,j-2) * (-0.005208333333333333d0) &
+                 + in(i+2,j-2) * (-0.005208333333333333d0) &
+                 + in(i+3,j-2) * (-0.005208333333333333d0) &
+                 + in(i+4,j-2) * (-0.005208333333333333d0) &
+                 + in(i+5,j-2) * (-0.005208333333333333d0) &
+                 + in(i+6,j-2) * (-0.005208333333333333d0) &
+                 + in(i+7,j-2) * (-0.005208333333333333d0) &
+                 + in(i+8,j-2) * (-0.005208333333333333d0) &
+                 + in(i-1,j-1) * (-0.03125d0) &
+                 + in(i+1,j-1) * (-0.03125d0) &
+                 + in(i+2,j-1) * (-0.03125d0) &
+                 + in(i+3,j-1) * (-0.03125d0) &
+                 + in(i+4,j-1) * (-0.03125d0) &
+                 + in(i+5,j-1) * (-0.03125d0) &
+                 + in(i+6,j-1) * (-0.03125d0) &
+                 + in(i+7,j-1) * (-0.03125d0) &
+                 + in(i+8,j-1) * (-0.03125d0) &
+                 + in(i-8,j+1) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+1) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+1) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+1) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+1) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+1) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+1) * (-0.005208333333333333d0) &
+                 + in(i-1,j+1) * (-0.03125d0) &
+                 + in(i+1,j+1) * (0.03125d0) &
+                 + in(i+2,j+1) * (0.005208333333333333d0) &
+                 + in(i+3,j+1) * (0.0020833333333333333d0) &
+                 + in(i+4,j+1) * (0.0011160714285714285d0) &
+                 + in(i+5,j+1) * (0.0006944444444444445d0) &
+                 + in(i+6,j+1) * (0.0004734848484848485d0) &
+                 + in(i+7,j+1) * (0.00034340659340659343d0) &
+                 + in(i+8,j+1) * (0.00026041666666666666d0) &
+                 + in(i-8,j+2) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+2) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+2) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+2) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+2) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+2) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+2) * (-0.005208333333333333d0) &
+                 + in(i-1,j+2) * (-0.03125d0) &
+                 + in(i+1,j+2) * (0.005208333333333333d0) &
+                 + in(i+2,j+2) * (0.015625d0) &
+                 + in(i+3,j+2) * (0.0020833333333333333d0) &
+                 + in(i+4,j+2) * (0.0011160714285714285d0) &
+                 + in(i+5,j+2) * (0.0006944444444444445d0) &
+                 + in(i+6,j+2) * (0.0004734848484848485d0) &
+                 + in(i+7,j+2) * (0.00034340659340659343d0) &
+                 + in(i+8,j+2) * (0.00026041666666666666d0) &
+                 + in(i-8,j+3) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+3) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+3) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+3) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+3) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+3) * (-0.005208333333333333d0) &
+                 + in(i-1,j+3) * (-0.03125d0) &
+                 + in(i+1,j+3) * (0.0020833333333333333d0) &
+                 + in(i+2,j+3) * (0.0020833333333333333d0) &
+                 + in(i+3,j+3) * (0.010416666666666666d0) &
+                 + in(i+4,j+3) * (0.0011160714285714285d0) &
+                 + in(i+5,j+3) * (0.0006944444444444445d0) &
+                 + in(i+6,j+3) * (0.0004734848484848485d0) &
+                 + in(i+7,j+3) * (0.00034340659340659343d0) &
+                 + in(i+8,j+3) * (0.00026041666666666666d0) &
+                 + in(i-8,j+4) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+4) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+4) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+4) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+4) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+4) * (-0.005208333333333333d0) &
+                 + in(i-1,j+4) * (-0.03125d0) &
+                 + in(i+1,j+4) * (0.0011160714285714285d0) &
+                 + in(i+2,j+4) * (0.0011160714285714285d0) &
+                 + in(i+3,j+4) * (0.0011160714285714285d0) &
+                 + in(i+4,j+4) * (0.0078125d0) &
+                 + in(i+5,j+4) * (0.0006944444444444445d0) &
+                 + in(i+6,j+4) * (0.0004734848484848485d0) &
+                 + in(i+7,j+4) * (0.00034340659340659343d0) &
+                 + in(i+8,j+4) * (0.00026041666666666666d0) &
+                 + in(i-8,j+5) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+5) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+5) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+5) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+5) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+5) * (-0.005208333333333333d0) &
+                 + in(i-1,j+5) * (-0.03125d0) &
+                 + in(i+1,j+5) * (0.0006944444444444445d0) &
+                 + in(i+2,j+5) * (0.0006944444444444445d0) &
+                 + in(i+3,j+5) * (0.0006944444444444445d0) &
+                 + in(i+4,j+5) * (0.0006944444444444445d0) &
+                 + in(i+5,j+5) * (0.00625d0) &
+                 + in(i+6,j+5) * (0.0004734848484848485d0) &
+                 + in(i+7,j+5) * (0.00034340659340659343d0) &
+                 + in(i+8,j+5) * (0.00026041666666666666d0) &
+                 + in(i-8,j+6) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+6) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+6) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+6) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+6) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+6) * (-0.005208333333333333d0) &
+                 + in(i-1,j+6) * (-0.03125d0) &
+                 + in(i+1,j+6) * (0.0004734848484848485d0) &
+                 + in(i+2,j+6) * (0.0004734848484848485d0) &
+                 + in(i+3,j+6) * (0.0004734848484848485d0) &
+                 + in(i+4,j+6) * (0.0004734848484848485d0) &
+                 + in(i+5,j+6) * (0.0004734848484848485d0) &
+                 + in(i+6,j+6) * (0.005208333333333333d0) &
+                 + in(i+7,j+6) * (0.00034340659340659343d0) &
+                 + in(i+8,j+6) * (0.00026041666666666666d0) &
+                 + in(i-8,j+7) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+7) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+7) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+7) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+7) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+7) * (-0.005208333333333333d0) &
+                 + in(i-1,j+7) * (-0.03125d0) &
+                 + in(i+1,j+7) * (0.00034340659340659343d0) &
+                 + in(i+2,j+7) * (0.00034340659340659343d0) &
+                 + in(i+3,j+7) * (0.00034340659340659343d0) &
+                 + in(i+4,j+7) * (0.00034340659340659343d0) &
+                 + in(i+5,j+7) * (0.00034340659340659343d0) &
+                 + in(i+6,j+7) * (0.00034340659340659343d0) &
+                 + in(i+7,j+7) * (0.004464285714285714d0) &
+                 + in(i+8,j+7) * (0.00026041666666666666d0) &
+                 + in(i-8,j+8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+8) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+8) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+8) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+8) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+8) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+8) * (-0.005208333333333333d0) &
+                 + in(i-1,j+8) * (-0.03125d0) &
+                 + in(i+1,j+8) * (0.00026041666666666666d0) &
+                 + in(i+2,j+8) * (0.00026041666666666666d0) &
+                 + in(i+3,j+8) * (0.00026041666666666666d0) &
+                 + in(i+4,j+8) * (0.00026041666666666666d0) &
+                 + in(i+5,j+8) * (0.00026041666666666666d0) &
+                 + in(i+6,j+8) * (0.00026041666666666666d0) &
+                 + in(i+7,j+8) * (0.00026041666666666666d0) &
+                 + in(i+8,j+8) * (0.00390625d0) &
 +0.0
       end do
       !$omp end simd
@@ -1159,260 +1193,262 @@ subroutine grid9(n, in, out)
     !$omp do
     do i=9,n-9-1
       !$omp simd
+      do j=9,n-9-1
+    do i=9,n-9-1
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i-9,j-9) * (-0.0030864197530864196) &
-                 + in(i+1,j-9) * (-0.00018155410312273057) &
-                 + in(i+2,j-9) * (-0.00018155410312273057) &
-                 + in(i+3,j-9) * (-0.00018155410312273057) &
-                 + in(i+4,j-9) * (-0.00018155410312273057) &
-                 + in(i+5,j-9) * (-0.00018155410312273057) &
-                 + in(i+6,j-9) * (-0.00018155410312273057) &
-                 + in(i+7,j-9) * (-0.00018155410312273057) &
-                 + in(i+8,j-9) * (-0.00018155410312273057) &
-                 + in(i+9,j-9) * (-0.00018155410312273057) &
-                 + in(i-8,j-8) * (-0.003472222222222222) &
-                 + in(i+1,j-8) * (-0.0002314814814814815) &
-                 + in(i+2,j-8) * (-0.0002314814814814815) &
-                 + in(i+3,j-8) * (-0.0002314814814814815) &
-                 + in(i+4,j-8) * (-0.0002314814814814815) &
-                 + in(i+5,j-8) * (-0.0002314814814814815) &
-                 + in(i+6,j-8) * (-0.0002314814814814815) &
-                 + in(i+7,j-8) * (-0.0002314814814814815) &
-                 + in(i+8,j-8) * (-0.0002314814814814815) &
-                 + in(i+9,j-8) * (-0.0002314814814814815) &
-                 + in(i-7,j-7) * (-0.003968253968253968) &
-                 + in(i+1,j-7) * (-0.00030525030525030525) &
-                 + in(i+2,j-7) * (-0.00030525030525030525) &
-                 + in(i+3,j-7) * (-0.00030525030525030525) &
-                 + in(i+4,j-7) * (-0.00030525030525030525) &
-                 + in(i+5,j-7) * (-0.00030525030525030525) &
-                 + in(i+6,j-7) * (-0.00030525030525030525) &
-                 + in(i+7,j-7) * (-0.00030525030525030525) &
-                 + in(i+8,j-7) * (-0.00030525030525030525) &
-                 + in(i+9,j-7) * (-0.00030525030525030525) &
-                 + in(i-6,j-6) * (-0.004629629629629629) &
-                 + in(i+1,j-6) * (-0.00042087542087542086) &
-                 + in(i+2,j-6) * (-0.00042087542087542086) &
-                 + in(i+3,j-6) * (-0.00042087542087542086) &
-                 + in(i+4,j-6) * (-0.00042087542087542086) &
-                 + in(i+5,j-6) * (-0.00042087542087542086) &
-                 + in(i+6,j-6) * (-0.00042087542087542086) &
-                 + in(i+7,j-6) * (-0.00042087542087542086) &
-                 + in(i+8,j-6) * (-0.00042087542087542086) &
-                 + in(i+9,j-6) * (-0.00042087542087542086) &
-                 + in(i-5,j-5) * (-0.005555555555555556) &
-                 + in(i+1,j-5) * (-0.0006172839506172839) &
-                 + in(i+2,j-5) * (-0.0006172839506172839) &
-                 + in(i+3,j-5) * (-0.0006172839506172839) &
-                 + in(i+4,j-5) * (-0.0006172839506172839) &
-                 + in(i+5,j-5) * (-0.0006172839506172839) &
-                 + in(i+6,j-5) * (-0.0006172839506172839) &
-                 + in(i+7,j-5) * (-0.0006172839506172839) &
-                 + in(i+8,j-5) * (-0.0006172839506172839) &
-                 + in(i+9,j-5) * (-0.0006172839506172839) &
-                 + in(i-4,j-4) * (-0.006944444444444444) &
-                 + in(i+1,j-4) * (-0.000992063492063492) &
-                 + in(i+2,j-4) * (-0.000992063492063492) &
-                 + in(i+3,j-4) * (-0.000992063492063492) &
-                 + in(i+4,j-4) * (-0.000992063492063492) &
-                 + in(i+5,j-4) * (-0.000992063492063492) &
-                 + in(i+6,j-4) * (-0.000992063492063492) &
-                 + in(i+7,j-4) * (-0.000992063492063492) &
-                 + in(i+8,j-4) * (-0.000992063492063492) &
-                 + in(i+9,j-4) * (-0.000992063492063492) &
-                 + in(i-3,j-3) * (-0.009259259259259259) &
-                 + in(i+1,j-3) * (-0.001851851851851852) &
-                 + in(i+2,j-3) * (-0.001851851851851852) &
-                 + in(i+3,j-3) * (-0.001851851851851852) &
-                 + in(i+4,j-3) * (-0.001851851851851852) &
-                 + in(i+5,j-3) * (-0.001851851851851852) &
-                 + in(i+6,j-3) * (-0.001851851851851852) &
-                 + in(i+7,j-3) * (-0.001851851851851852) &
-                 + in(i+8,j-3) * (-0.001851851851851852) &
-                 + in(i+9,j-3) * (-0.001851851851851852) &
-                 + in(i-2,j-2) * (-0.013888888888888888) &
-                 + in(i+1,j-2) * (-0.004629629629629629) &
-                 + in(i+2,j-2) * (-0.004629629629629629) &
-                 + in(i+3,j-2) * (-0.004629629629629629) &
-                 + in(i+4,j-2) * (-0.004629629629629629) &
-                 + in(i+5,j-2) * (-0.004629629629629629) &
-                 + in(i+6,j-2) * (-0.004629629629629629) &
-                 + in(i+7,j-2) * (-0.004629629629629629) &
-                 + in(i+8,j-2) * (-0.004629629629629629) &
-                 + in(i+9,j-2) * (-0.004629629629629629) &
-                 + in(i-1,j-1) * (-0.027777777777777776) &
-                 + in(i+1,j-1) * (-0.027777777777777776) &
-                 + in(i+2,j-1) * (-0.027777777777777776) &
-                 + in(i+3,j-1) * (-0.027777777777777776) &
-                 + in(i+4,j-1) * (-0.027777777777777776) &
-                 + in(i+5,j-1) * (-0.027777777777777776) &
-                 + in(i+6,j-1) * (-0.027777777777777776) &
-                 + in(i+7,j-1) * (-0.027777777777777776) &
-                 + in(i+8,j-1) * (-0.027777777777777776) &
-                 + in(i+9,j-1) * (-0.027777777777777776) &
-                 + in(i-9,j+1) * (-0.00018155410312273057) &
-                 + in(i-8,j+1) * (-0.0002314814814814815) &
-                 + in(i-7,j+1) * (-0.00030525030525030525) &
-                 + in(i-6,j+1) * (-0.00042087542087542086) &
-                 + in(i-5,j+1) * (-0.0006172839506172839) &
-                 + in(i-4,j+1) * (-0.000992063492063492) &
-                 + in(i-3,j+1) * (-0.001851851851851852) &
-                 + in(i-2,j+1) * (-0.004629629629629629) &
-                 + in(i-1,j+1) * (-0.027777777777777776) &
-                 + in(i+1,j+1) * (0.027777777777777776) &
-                 + in(i+2,j+1) * (0.004629629629629629) &
-                 + in(i+3,j+1) * (0.001851851851851852) &
-                 + in(i+4,j+1) * (0.000992063492063492) &
-                 + in(i+5,j+1) * (0.0006172839506172839) &
-                 + in(i+6,j+1) * (0.00042087542087542086) &
-                 + in(i+7,j+1) * (0.00030525030525030525) &
-                 + in(i+8,j+1) * (0.0002314814814814815) &
-                 + in(i+9,j+1) * (0.00018155410312273057) &
-                 + in(i-9,j+2) * (-0.00018155410312273057) &
-                 + in(i-8,j+2) * (-0.0002314814814814815) &
-                 + in(i-7,j+2) * (-0.00030525030525030525) &
-                 + in(i-6,j+2) * (-0.00042087542087542086) &
-                 + in(i-5,j+2) * (-0.0006172839506172839) &
-                 + in(i-4,j+2) * (-0.000992063492063492) &
-                 + in(i-3,j+2) * (-0.001851851851851852) &
-                 + in(i-2,j+2) * (-0.004629629629629629) &
-                 + in(i-1,j+2) * (-0.027777777777777776) &
-                 + in(i+1,j+2) * (0.004629629629629629) &
-                 + in(i+2,j+2) * (0.013888888888888888) &
-                 + in(i+3,j+2) * (0.001851851851851852) &
-                 + in(i+4,j+2) * (0.000992063492063492) &
-                 + in(i+5,j+2) * (0.0006172839506172839) &
-                 + in(i+6,j+2) * (0.00042087542087542086) &
-                 + in(i+7,j+2) * (0.00030525030525030525) &
-                 + in(i+8,j+2) * (0.0002314814814814815) &
-                 + in(i+9,j+2) * (0.00018155410312273057) &
-                 + in(i-9,j+3) * (-0.00018155410312273057) &
-                 + in(i-8,j+3) * (-0.0002314814814814815) &
-                 + in(i-7,j+3) * (-0.00030525030525030525) &
-                 + in(i-6,j+3) * (-0.00042087542087542086) &
-                 + in(i-5,j+3) * (-0.0006172839506172839) &
-                 + in(i-4,j+3) * (-0.000992063492063492) &
-                 + in(i-3,j+3) * (-0.001851851851851852) &
-                 + in(i-2,j+3) * (-0.004629629629629629) &
-                 + in(i-1,j+3) * (-0.027777777777777776) &
-                 + in(i+1,j+3) * (0.001851851851851852) &
-                 + in(i+2,j+3) * (0.001851851851851852) &
-                 + in(i+3,j+3) * (0.009259259259259259) &
-                 + in(i+4,j+3) * (0.000992063492063492) &
-                 + in(i+5,j+3) * (0.0006172839506172839) &
-                 + in(i+6,j+3) * (0.00042087542087542086) &
-                 + in(i+7,j+3) * (0.00030525030525030525) &
-                 + in(i+8,j+3) * (0.0002314814814814815) &
-                 + in(i+9,j+3) * (0.00018155410312273057) &
-                 + in(i-9,j+4) * (-0.00018155410312273057) &
-                 + in(i-8,j+4) * (-0.0002314814814814815) &
-                 + in(i-7,j+4) * (-0.00030525030525030525) &
-                 + in(i-6,j+4) * (-0.00042087542087542086) &
-                 + in(i-5,j+4) * (-0.0006172839506172839) &
-                 + in(i-4,j+4) * (-0.000992063492063492) &
-                 + in(i-3,j+4) * (-0.001851851851851852) &
-                 + in(i-2,j+4) * (-0.004629629629629629) &
-                 + in(i-1,j+4) * (-0.027777777777777776) &
-                 + in(i+1,j+4) * (0.000992063492063492) &
-                 + in(i+2,j+4) * (0.000992063492063492) &
-                 + in(i+3,j+4) * (0.000992063492063492) &
-                 + in(i+4,j+4) * (0.006944444444444444) &
-                 + in(i+5,j+4) * (0.0006172839506172839) &
-                 + in(i+6,j+4) * (0.00042087542087542086) &
-                 + in(i+7,j+4) * (0.00030525030525030525) &
-                 + in(i+8,j+4) * (0.0002314814814814815) &
-                 + in(i+9,j+4) * (0.00018155410312273057) &
-                 + in(i-9,j+5) * (-0.00018155410312273057) &
-                 + in(i-8,j+5) * (-0.0002314814814814815) &
-                 + in(i-7,j+5) * (-0.00030525030525030525) &
-                 + in(i-6,j+5) * (-0.00042087542087542086) &
-                 + in(i-5,j+5) * (-0.0006172839506172839) &
-                 + in(i-4,j+5) * (-0.000992063492063492) &
-                 + in(i-3,j+5) * (-0.001851851851851852) &
-                 + in(i-2,j+5) * (-0.004629629629629629) &
-                 + in(i-1,j+5) * (-0.027777777777777776) &
-                 + in(i+1,j+5) * (0.0006172839506172839) &
-                 + in(i+2,j+5) * (0.0006172839506172839) &
-                 + in(i+3,j+5) * (0.0006172839506172839) &
-                 + in(i+4,j+5) * (0.0006172839506172839) &
-                 + in(i+5,j+5) * (0.005555555555555556) &
-                 + in(i+6,j+5) * (0.00042087542087542086) &
-                 + in(i+7,j+5) * (0.00030525030525030525) &
-                 + in(i+8,j+5) * (0.0002314814814814815) &
-                 + in(i+9,j+5) * (0.00018155410312273057) &
-                 + in(i-9,j+6) * (-0.00018155410312273057) &
-                 + in(i-8,j+6) * (-0.0002314814814814815) &
-                 + in(i-7,j+6) * (-0.00030525030525030525) &
-                 + in(i-6,j+6) * (-0.00042087542087542086) &
-                 + in(i-5,j+6) * (-0.0006172839506172839) &
-                 + in(i-4,j+6) * (-0.000992063492063492) &
-                 + in(i-3,j+6) * (-0.001851851851851852) &
-                 + in(i-2,j+6) * (-0.004629629629629629) &
-                 + in(i-1,j+6) * (-0.027777777777777776) &
-                 + in(i+1,j+6) * (0.00042087542087542086) &
-                 + in(i+2,j+6) * (0.00042087542087542086) &
-                 + in(i+3,j+6) * (0.00042087542087542086) &
-                 + in(i+4,j+6) * (0.00042087542087542086) &
-                 + in(i+5,j+6) * (0.00042087542087542086) &
-                 + in(i+6,j+6) * (0.004629629629629629) &
-                 + in(i+7,j+6) * (0.00030525030525030525) &
-                 + in(i+8,j+6) * (0.0002314814814814815) &
-                 + in(i+9,j+6) * (0.00018155410312273057) &
-                 + in(i-9,j+7) * (-0.00018155410312273057) &
-                 + in(i-8,j+7) * (-0.0002314814814814815) &
-                 + in(i-7,j+7) * (-0.00030525030525030525) &
-                 + in(i-6,j+7) * (-0.00042087542087542086) &
-                 + in(i-5,j+7) * (-0.0006172839506172839) &
-                 + in(i-4,j+7) * (-0.000992063492063492) &
-                 + in(i-3,j+7) * (-0.001851851851851852) &
-                 + in(i-2,j+7) * (-0.004629629629629629) &
-                 + in(i-1,j+7) * (-0.027777777777777776) &
-                 + in(i+1,j+7) * (0.00030525030525030525) &
-                 + in(i+2,j+7) * (0.00030525030525030525) &
-                 + in(i+3,j+7) * (0.00030525030525030525) &
-                 + in(i+4,j+7) * (0.00030525030525030525) &
-                 + in(i+5,j+7) * (0.00030525030525030525) &
-                 + in(i+6,j+7) * (0.00030525030525030525) &
-                 + in(i+7,j+7) * (0.003968253968253968) &
-                 + in(i+8,j+7) * (0.0002314814814814815) &
-                 + in(i+9,j+7) * (0.00018155410312273057) &
-                 + in(i-9,j+8) * (-0.00018155410312273057) &
-                 + in(i-8,j+8) * (-0.0002314814814814815) &
-                 + in(i-7,j+8) * (-0.00030525030525030525) &
-                 + in(i-6,j+8) * (-0.00042087542087542086) &
-                 + in(i-5,j+8) * (-0.0006172839506172839) &
-                 + in(i-4,j+8) * (-0.000992063492063492) &
-                 + in(i-3,j+8) * (-0.001851851851851852) &
-                 + in(i-2,j+8) * (-0.004629629629629629) &
-                 + in(i-1,j+8) * (-0.027777777777777776) &
-                 + in(i+1,j+8) * (0.0002314814814814815) &
-                 + in(i+2,j+8) * (0.0002314814814814815) &
-                 + in(i+3,j+8) * (0.0002314814814814815) &
-                 + in(i+4,j+8) * (0.0002314814814814815) &
-                 + in(i+5,j+8) * (0.0002314814814814815) &
-                 + in(i+6,j+8) * (0.0002314814814814815) &
-                 + in(i+7,j+8) * (0.0002314814814814815) &
-                 + in(i+8,j+8) * (0.003472222222222222) &
-                 + in(i+9,j+8) * (0.00018155410312273057) &
-                 + in(i-9,j+9) * (-0.00018155410312273057) &
-                 + in(i-8,j+9) * (-0.0002314814814814815) &
-                 + in(i-7,j+9) * (-0.00030525030525030525) &
-                 + in(i-6,j+9) * (-0.00042087542087542086) &
-                 + in(i-5,j+9) * (-0.0006172839506172839) &
-                 + in(i-4,j+9) * (-0.000992063492063492) &
-                 + in(i-3,j+9) * (-0.001851851851851852) &
-                 + in(i-2,j+9) * (-0.004629629629629629) &
-                 + in(i-1,j+9) * (-0.027777777777777776) &
-                 + in(i+1,j+9) * (0.00018155410312273057) &
-                 + in(i+2,j+9) * (0.00018155410312273057) &
-                 + in(i+3,j+9) * (0.00018155410312273057) &
-                 + in(i+4,j+9) * (0.00018155410312273057) &
-                 + in(i+5,j+9) * (0.00018155410312273057) &
-                 + in(i+6,j+9) * (0.00018155410312273057) &
-                 + in(i+7,j+9) * (0.00018155410312273057) &
-                 + in(i+8,j+9) * (0.00018155410312273057) &
-                 + in(i+9,j+9) * (0.0030864197530864196) &
+                 + in(i-9,j-9) * (-0.0030864197530864196d0) &
+                 + in(i+1,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+2,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+3,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+4,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+5,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+6,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+7,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+8,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+9,j-9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j-8) * (-0.003472222222222222d0) &
+                 + in(i+1,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+2,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+3,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+4,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+5,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+6,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+7,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+8,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+9,j-8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j-7) * (-0.003968253968253968d0) &
+                 + in(i+1,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+2,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+3,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+4,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+5,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+6,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+7,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+8,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+9,j-7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j-6) * (-0.004629629629629629d0) &
+                 + in(i+1,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+2,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+3,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+4,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+5,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+6,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+7,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+8,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+9,j-6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j-5) * (-0.005555555555555556d0) &
+                 + in(i+1,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+2,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+3,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+4,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+5,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+6,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+7,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+8,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+9,j-5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j-4) * (-0.006944444444444444d0) &
+                 + in(i+1,j-4) * (-0.000992063492063492d0) &
+                 + in(i+2,j-4) * (-0.000992063492063492d0) &
+                 + in(i+3,j-4) * (-0.000992063492063492d0) &
+                 + in(i+4,j-4) * (-0.000992063492063492d0) &
+                 + in(i+5,j-4) * (-0.000992063492063492d0) &
+                 + in(i+6,j-4) * (-0.000992063492063492d0) &
+                 + in(i+7,j-4) * (-0.000992063492063492d0) &
+                 + in(i+8,j-4) * (-0.000992063492063492d0) &
+                 + in(i+9,j-4) * (-0.000992063492063492d0) &
+                 + in(i-3,j-3) * (-0.009259259259259259d0) &
+                 + in(i+1,j-3) * (-0.001851851851851852d0) &
+                 + in(i+2,j-3) * (-0.001851851851851852d0) &
+                 + in(i+3,j-3) * (-0.001851851851851852d0) &
+                 + in(i+4,j-3) * (-0.001851851851851852d0) &
+                 + in(i+5,j-3) * (-0.001851851851851852d0) &
+                 + in(i+6,j-3) * (-0.001851851851851852d0) &
+                 + in(i+7,j-3) * (-0.001851851851851852d0) &
+                 + in(i+8,j-3) * (-0.001851851851851852d0) &
+                 + in(i+9,j-3) * (-0.001851851851851852d0) &
+                 + in(i-2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+1,j-2) * (-0.004629629629629629d0) &
+                 + in(i+2,j-2) * (-0.004629629629629629d0) &
+                 + in(i+3,j-2) * (-0.004629629629629629d0) &
+                 + in(i+4,j-2) * (-0.004629629629629629d0) &
+                 + in(i+5,j-2) * (-0.004629629629629629d0) &
+                 + in(i+6,j-2) * (-0.004629629629629629d0) &
+                 + in(i+7,j-2) * (-0.004629629629629629d0) &
+                 + in(i+8,j-2) * (-0.004629629629629629d0) &
+                 + in(i+9,j-2) * (-0.004629629629629629d0) &
+                 + in(i-1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+2,j-1) * (-0.027777777777777776d0) &
+                 + in(i+3,j-1) * (-0.027777777777777776d0) &
+                 + in(i+4,j-1) * (-0.027777777777777776d0) &
+                 + in(i+5,j-1) * (-0.027777777777777776d0) &
+                 + in(i+6,j-1) * (-0.027777777777777776d0) &
+                 + in(i+7,j-1) * (-0.027777777777777776d0) &
+                 + in(i+8,j-1) * (-0.027777777777777776d0) &
+                 + in(i+9,j-1) * (-0.027777777777777776d0) &
+                 + in(i-9,j+1) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+1) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+1) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+1) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+1) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+1) * (-0.000992063492063492d0) &
+                 + in(i-3,j+1) * (-0.001851851851851852d0) &
+                 + in(i-2,j+1) * (-0.004629629629629629d0) &
+                 + in(i-1,j+1) * (-0.027777777777777776d0) &
+                 + in(i+1,j+1) * (0.027777777777777776d0) &
+                 + in(i+2,j+1) * (0.004629629629629629d0) &
+                 + in(i+3,j+1) * (0.001851851851851852d0) &
+                 + in(i+4,j+1) * (0.000992063492063492d0) &
+                 + in(i+5,j+1) * (0.0006172839506172839d0) &
+                 + in(i+6,j+1) * (0.00042087542087542086d0) &
+                 + in(i+7,j+1) * (0.00030525030525030525d0) &
+                 + in(i+8,j+1) * (0.0002314814814814815d0) &
+                 + in(i+9,j+1) * (0.00018155410312273057d0) &
+                 + in(i-9,j+2) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+2) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+2) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+2) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+2) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+2) * (-0.000992063492063492d0) &
+                 + in(i-3,j+2) * (-0.001851851851851852d0) &
+                 + in(i-2,j+2) * (-0.004629629629629629d0) &
+                 + in(i-1,j+2) * (-0.027777777777777776d0) &
+                 + in(i+1,j+2) * (0.004629629629629629d0) &
+                 + in(i+2,j+2) * (0.013888888888888888d0) &
+                 + in(i+3,j+2) * (0.001851851851851852d0) &
+                 + in(i+4,j+2) * (0.000992063492063492d0) &
+                 + in(i+5,j+2) * (0.0006172839506172839d0) &
+                 + in(i+6,j+2) * (0.00042087542087542086d0) &
+                 + in(i+7,j+2) * (0.00030525030525030525d0) &
+                 + in(i+8,j+2) * (0.0002314814814814815d0) &
+                 + in(i+9,j+2) * (0.00018155410312273057d0) &
+                 + in(i-9,j+3) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+3) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+3) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+3) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+3) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+3) * (-0.000992063492063492d0) &
+                 + in(i-3,j+3) * (-0.001851851851851852d0) &
+                 + in(i-2,j+3) * (-0.004629629629629629d0) &
+                 + in(i-1,j+3) * (-0.027777777777777776d0) &
+                 + in(i+1,j+3) * (0.001851851851851852d0) &
+                 + in(i+2,j+3) * (0.001851851851851852d0) &
+                 + in(i+3,j+3) * (0.009259259259259259d0) &
+                 + in(i+4,j+3) * (0.000992063492063492d0) &
+                 + in(i+5,j+3) * (0.0006172839506172839d0) &
+                 + in(i+6,j+3) * (0.00042087542087542086d0) &
+                 + in(i+7,j+3) * (0.00030525030525030525d0) &
+                 + in(i+8,j+3) * (0.0002314814814814815d0) &
+                 + in(i+9,j+3) * (0.00018155410312273057d0) &
+                 + in(i-9,j+4) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+4) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+4) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+4) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+4) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+4) * (-0.000992063492063492d0) &
+                 + in(i-3,j+4) * (-0.001851851851851852d0) &
+                 + in(i-2,j+4) * (-0.004629629629629629d0) &
+                 + in(i-1,j+4) * (-0.027777777777777776d0) &
+                 + in(i+1,j+4) * (0.000992063492063492d0) &
+                 + in(i+2,j+4) * (0.000992063492063492d0) &
+                 + in(i+3,j+4) * (0.000992063492063492d0) &
+                 + in(i+4,j+4) * (0.006944444444444444d0) &
+                 + in(i+5,j+4) * (0.0006172839506172839d0) &
+                 + in(i+6,j+4) * (0.00042087542087542086d0) &
+                 + in(i+7,j+4) * (0.00030525030525030525d0) &
+                 + in(i+8,j+4) * (0.0002314814814814815d0) &
+                 + in(i+9,j+4) * (0.00018155410312273057d0) &
+                 + in(i-9,j+5) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+5) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+5) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+5) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+5) * (-0.000992063492063492d0) &
+                 + in(i-3,j+5) * (-0.001851851851851852d0) &
+                 + in(i-2,j+5) * (-0.004629629629629629d0) &
+                 + in(i-1,j+5) * (-0.027777777777777776d0) &
+                 + in(i+1,j+5) * (0.0006172839506172839d0) &
+                 + in(i+2,j+5) * (0.0006172839506172839d0) &
+                 + in(i+3,j+5) * (0.0006172839506172839d0) &
+                 + in(i+4,j+5) * (0.0006172839506172839d0) &
+                 + in(i+5,j+5) * (0.005555555555555556d0) &
+                 + in(i+6,j+5) * (0.00042087542087542086d0) &
+                 + in(i+7,j+5) * (0.00030525030525030525d0) &
+                 + in(i+8,j+5) * (0.0002314814814814815d0) &
+                 + in(i+9,j+5) * (0.00018155410312273057d0) &
+                 + in(i-9,j+6) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+6) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+6) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+6) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+6) * (-0.000992063492063492d0) &
+                 + in(i-3,j+6) * (-0.001851851851851852d0) &
+                 + in(i-2,j+6) * (-0.004629629629629629d0) &
+                 + in(i-1,j+6) * (-0.027777777777777776d0) &
+                 + in(i+1,j+6) * (0.00042087542087542086d0) &
+                 + in(i+2,j+6) * (0.00042087542087542086d0) &
+                 + in(i+3,j+6) * (0.00042087542087542086d0) &
+                 + in(i+4,j+6) * (0.00042087542087542086d0) &
+                 + in(i+5,j+6) * (0.00042087542087542086d0) &
+                 + in(i+6,j+6) * (0.004629629629629629d0) &
+                 + in(i+7,j+6) * (0.00030525030525030525d0) &
+                 + in(i+8,j+6) * (0.0002314814814814815d0) &
+                 + in(i+9,j+6) * (0.00018155410312273057d0) &
+                 + in(i-9,j+7) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+7) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+7) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+7) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+7) * (-0.000992063492063492d0) &
+                 + in(i-3,j+7) * (-0.001851851851851852d0) &
+                 + in(i-2,j+7) * (-0.004629629629629629d0) &
+                 + in(i-1,j+7) * (-0.027777777777777776d0) &
+                 + in(i+1,j+7) * (0.00030525030525030525d0) &
+                 + in(i+2,j+7) * (0.00030525030525030525d0) &
+                 + in(i+3,j+7) * (0.00030525030525030525d0) &
+                 + in(i+4,j+7) * (0.00030525030525030525d0) &
+                 + in(i+5,j+7) * (0.00030525030525030525d0) &
+                 + in(i+6,j+7) * (0.00030525030525030525d0) &
+                 + in(i+7,j+7) * (0.003968253968253968d0) &
+                 + in(i+8,j+7) * (0.0002314814814814815d0) &
+                 + in(i+9,j+7) * (0.00018155410312273057d0) &
+                 + in(i-9,j+8) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+8) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+8) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+8) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+8) * (-0.000992063492063492d0) &
+                 + in(i-3,j+8) * (-0.001851851851851852d0) &
+                 + in(i-2,j+8) * (-0.004629629629629629d0) &
+                 + in(i-1,j+8) * (-0.027777777777777776d0) &
+                 + in(i+1,j+8) * (0.0002314814814814815d0) &
+                 + in(i+2,j+8) * (0.0002314814814814815d0) &
+                 + in(i+3,j+8) * (0.0002314814814814815d0) &
+                 + in(i+4,j+8) * (0.0002314814814814815d0) &
+                 + in(i+5,j+8) * (0.0002314814814814815d0) &
+                 + in(i+6,j+8) * (0.0002314814814814815d0) &
+                 + in(i+7,j+8) * (0.0002314814814814815d0) &
+                 + in(i+8,j+8) * (0.003472222222222222d0) &
+                 + in(i+9,j+8) * (0.00018155410312273057d0) &
+                 + in(i-9,j+9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+9) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+9) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+9) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+9) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+9) * (-0.000992063492063492d0) &
+                 + in(i-3,j+9) * (-0.001851851851851852d0) &
+                 + in(i-2,j+9) * (-0.004629629629629629d0) &
+                 + in(i-1,j+9) * (-0.027777777777777776d0) &
+                 + in(i+1,j+9) * (0.00018155410312273057d0) &
+                 + in(i+2,j+9) * (0.00018155410312273057d0) &
+                 + in(i+3,j+9) * (0.00018155410312273057d0) &
+                 + in(i+4,j+9) * (0.00018155410312273057d0) &
+                 + in(i+5,j+9) * (0.00018155410312273057d0) &
+                 + in(i+6,j+9) * (0.00018155410312273057d0) &
+                 + in(i+7,j+9) * (0.00018155410312273057d0) &
+                 + in(i+8,j+9) * (0.00018155410312273057d0) &
+                 + in(i+9,j+9) * (0.0030864197530864196d0) &
 +0.0
       end do
       !$omp end simd
diff --git a/FORTRAN/stencil_pretty.f90 b/FORTRAN/stencil_pretty.f90
index 5e2b50d4e..cb4bf8052 100644
--- a/FORTRAN/stencil_pretty.f90
+++ b/FORTRAN/stencil_pretty.f90
@@ -8,10 +8,10 @@ subroutine star1(n, in, out)
     do i=1,n-1-1
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-1) * (-0.5) &
-                 + in(i-1,j+0) * (-0.5) &
-                 + in(i+1,j+0) * (0.5) &
-                 + in(i+0,j+1) * (0.5) &
+                 + in(i+0,j-1) * (-0.5d0) &
+                 + in(i-1,j+0) * (-0.5d0) &
+                 + in(i+1,j+0) * (0.5d0) &
+                 + in(i+0,j+1) * (0.5d0) &
 +0.0
       end do
     end do
@@ -27,14 +27,14 @@ subroutine star2(n, in, out)
     do i=2,n-2-1
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-2) * (-0.125) &
-                 + in(i+0,j-1) * (-0.25) &
-                 + in(i-2,j+0) * (-0.125) &
-                 + in(i-1,j+0) * (-0.25) &
-                 + in(i+1,j+0) * (0.25) &
-                 + in(i+2,j+0) * (0.125) &
-                 + in(i+0,j+1) * (0.25) &
-                 + in(i+0,j+2) * (0.125) &
+                 + in(i+0,j-2) * (-0.125d0) &
+                 + in(i+0,j-1) * (-0.25d0) &
+                 + in(i-2,j+0) * (-0.125d0) &
+                 + in(i-1,j+0) * (-0.25d0) &
+                 + in(i+1,j+0) * (0.25d0) &
+                 + in(i+2,j+0) * (0.125d0) &
+                 + in(i+0,j+1) * (0.25d0) &
+                 + in(i+0,j+2) * (0.125d0) &
 +0.0
       end do
     end do
@@ -50,18 +50,18 @@ subroutine star3(n, in, out)
     do i=3,n-3-1
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-3) * (-0.05555555555555555) &
-                 + in(i+0,j-2) * (-0.08333333333333333) &
-                 + in(i+0,j-1) * (-0.16666666666666666) &
-                 + in(i-3,j+0) * (-0.05555555555555555) &
-                 + in(i-2,j+0) * (-0.08333333333333333) &
-                 + in(i-1,j+0) * (-0.16666666666666666) &
-                 + in(i+1,j+0) * (0.16666666666666666) &
-                 + in(i+2,j+0) * (0.08333333333333333) &
-                 + in(i+3,j+0) * (0.05555555555555555) &
-                 + in(i+0,j+1) * (0.16666666666666666) &
-                 + in(i+0,j+2) * (0.08333333333333333) &
-                 + in(i+0,j+3) * (0.05555555555555555) &
+                 + in(i+0,j-3) * (-0.05555555555555555d0) &
+                 + in(i+0,j-2) * (-0.08333333333333333d0) &
+                 + in(i+0,j-1) * (-0.16666666666666666d0) &
+                 + in(i-3,j+0) * (-0.05555555555555555d0) &
+                 + in(i-2,j+0) * (-0.08333333333333333d0) &
+                 + in(i-1,j+0) * (-0.16666666666666666d0) &
+                 + in(i+1,j+0) * (0.16666666666666666d0) &
+                 + in(i+2,j+0) * (0.08333333333333333d0) &
+                 + in(i+3,j+0) * (0.05555555555555555d0) &
+                 + in(i+0,j+1) * (0.16666666666666666d0) &
+                 + in(i+0,j+2) * (0.08333333333333333d0) &
+                 + in(i+0,j+3) * (0.05555555555555555d0) &
 +0.0
       end do
     end do
@@ -77,22 +77,22 @@ subroutine star4(n, in, out)
     do i=4,n-4-1
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-4) * (-0.03125) &
-                 + in(i+0,j-3) * (-0.041666666666666664) &
-                 + in(i+0,j-2) * (-0.0625) &
-                 + in(i+0,j-1) * (-0.125) &
-                 + in(i-4,j+0) * (-0.03125) &
-                 + in(i-3,j+0) * (-0.041666666666666664) &
-                 + in(i-2,j+0) * (-0.0625) &
-                 + in(i-1,j+0) * (-0.125) &
-                 + in(i+1,j+0) * (0.125) &
-                 + in(i+2,j+0) * (0.0625) &
-                 + in(i+3,j+0) * (0.041666666666666664) &
-                 + in(i+4,j+0) * (0.03125) &
-                 + in(i+0,j+1) * (0.125) &
-                 + in(i+0,j+2) * (0.0625) &
-                 + in(i+0,j+3) * (0.041666666666666664) &
-                 + in(i+0,j+4) * (0.03125) &
+                 + in(i+0,j-4) * (-0.03125d0) &
+                 + in(i+0,j-3) * (-0.041666666666666664d0) &
+                 + in(i+0,j-2) * (-0.0625d0) &
+                 + in(i+0,j-1) * (-0.125d0) &
+                 + in(i-4,j+0) * (-0.03125d0) &
+                 + in(i-3,j+0) * (-0.041666666666666664d0) &
+                 + in(i-2,j+0) * (-0.0625d0) &
+                 + in(i-1,j+0) * (-0.125d0) &
+                 + in(i+1,j+0) * (0.125d0) &
+                 + in(i+2,j+0) * (0.0625d0) &
+                 + in(i+3,j+0) * (0.041666666666666664d0) &
+                 + in(i+4,j+0) * (0.03125d0) &
+                 + in(i+0,j+1) * (0.125d0) &
+                 + in(i+0,j+2) * (0.0625d0) &
+                 + in(i+0,j+3) * (0.041666666666666664d0) &
+                 + in(i+0,j+4) * (0.03125d0) &
 +0.0
       end do
     end do
@@ -108,26 +108,26 @@ subroutine star5(n, in, out)
     do i=5,n-5-1
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-5) * (-0.02) &
-                 + in(i+0,j-4) * (-0.025) &
-                 + in(i+0,j-3) * (-0.03333333333333333) &
-                 + in(i+0,j-2) * (-0.05) &
-                 + in(i+0,j-1) * (-0.1) &
-                 + in(i-5,j+0) * (-0.02) &
-                 + in(i-4,j+0) * (-0.025) &
-                 + in(i-3,j+0) * (-0.03333333333333333) &
-                 + in(i-2,j+0) * (-0.05) &
-                 + in(i-1,j+0) * (-0.1) &
-                 + in(i+1,j+0) * (0.1) &
-                 + in(i+2,j+0) * (0.05) &
-                 + in(i+3,j+0) * (0.03333333333333333) &
-                 + in(i+4,j+0) * (0.025) &
-                 + in(i+5,j+0) * (0.02) &
-                 + in(i+0,j+1) * (0.1) &
-                 + in(i+0,j+2) * (0.05) &
-                 + in(i+0,j+3) * (0.03333333333333333) &
-                 + in(i+0,j+4) * (0.025) &
-                 + in(i+0,j+5) * (0.02) &
+                 + in(i+0,j-5) * (-0.02d0) &
+                 + in(i+0,j-4) * (-0.025d0) &
+                 + in(i+0,j-3) * (-0.03333333333333333d0) &
+                 + in(i+0,j-2) * (-0.05d0) &
+                 + in(i+0,j-1) * (-0.1d0) &
+                 + in(i-5,j+0) * (-0.02d0) &
+                 + in(i-4,j+0) * (-0.025d0) &
+                 + in(i-3,j+0) * (-0.03333333333333333d0) &
+                 + in(i-2,j+0) * (-0.05d0) &
+                 + in(i-1,j+0) * (-0.1d0) &
+                 + in(i+1,j+0) * (0.1d0) &
+                 + in(i+2,j+0) * (0.05d0) &
+                 + in(i+3,j+0) * (0.03333333333333333d0) &
+                 + in(i+4,j+0) * (0.025d0) &
+                 + in(i+5,j+0) * (0.02d0) &
+                 + in(i+0,j+1) * (0.1d0) &
+                 + in(i+0,j+2) * (0.05d0) &
+                 + in(i+0,j+3) * (0.03333333333333333d0) &
+                 + in(i+0,j+4) * (0.025d0) &
+                 + in(i+0,j+5) * (0.02d0) &
 +0.0
       end do
     end do
@@ -143,30 +143,30 @@ subroutine star6(n, in, out)
     do i=6,n-6-1
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-6) * (-0.013888888888888888) &
-                 + in(i+0,j-5) * (-0.016666666666666666) &
-                 + in(i+0,j-4) * (-0.020833333333333332) &
-                 + in(i+0,j-3) * (-0.027777777777777776) &
-                 + in(i+0,j-2) * (-0.041666666666666664) &
-                 + in(i+0,j-1) * (-0.08333333333333333) &
-                 + in(i-6,j+0) * (-0.013888888888888888) &
-                 + in(i-5,j+0) * (-0.016666666666666666) &
-                 + in(i-4,j+0) * (-0.020833333333333332) &
-                 + in(i-3,j+0) * (-0.027777777777777776) &
-                 + in(i-2,j+0) * (-0.041666666666666664) &
-                 + in(i-1,j+0) * (-0.08333333333333333) &
-                 + in(i+1,j+0) * (0.08333333333333333) &
-                 + in(i+2,j+0) * (0.041666666666666664) &
-                 + in(i+3,j+0) * (0.027777777777777776) &
-                 + in(i+4,j+0) * (0.020833333333333332) &
-                 + in(i+5,j+0) * (0.016666666666666666) &
-                 + in(i+6,j+0) * (0.013888888888888888) &
-                 + in(i+0,j+1) * (0.08333333333333333) &
-                 + in(i+0,j+2) * (0.041666666666666664) &
-                 + in(i+0,j+3) * (0.027777777777777776) &
-                 + in(i+0,j+4) * (0.020833333333333332) &
-                 + in(i+0,j+5) * (0.016666666666666666) &
-                 + in(i+0,j+6) * (0.013888888888888888) &
+                 + in(i+0,j-6) * (-0.013888888888888888d0) &
+                 + in(i+0,j-5) * (-0.016666666666666666d0) &
+                 + in(i+0,j-4) * (-0.020833333333333332d0) &
+                 + in(i+0,j-3) * (-0.027777777777777776d0) &
+                 + in(i+0,j-2) * (-0.041666666666666664d0) &
+                 + in(i+0,j-1) * (-0.08333333333333333d0) &
+                 + in(i-6,j+0) * (-0.013888888888888888d0) &
+                 + in(i-5,j+0) * (-0.016666666666666666d0) &
+                 + in(i-4,j+0) * (-0.020833333333333332d0) &
+                 + in(i-3,j+0) * (-0.027777777777777776d0) &
+                 + in(i-2,j+0) * (-0.041666666666666664d0) &
+                 + in(i-1,j+0) * (-0.08333333333333333d0) &
+                 + in(i+1,j+0) * (0.08333333333333333d0) &
+                 + in(i+2,j+0) * (0.041666666666666664d0) &
+                 + in(i+3,j+0) * (0.027777777777777776d0) &
+                 + in(i+4,j+0) * (0.020833333333333332d0) &
+                 + in(i+5,j+0) * (0.016666666666666666d0) &
+                 + in(i+6,j+0) * (0.013888888888888888d0) &
+                 + in(i+0,j+1) * (0.08333333333333333d0) &
+                 + in(i+0,j+2) * (0.041666666666666664d0) &
+                 + in(i+0,j+3) * (0.027777777777777776d0) &
+                 + in(i+0,j+4) * (0.020833333333333332d0) &
+                 + in(i+0,j+5) * (0.016666666666666666d0) &
+                 + in(i+0,j+6) * (0.013888888888888888d0) &
 +0.0
       end do
     end do
@@ -182,34 +182,34 @@ subroutine star7(n, in, out)
     do i=7,n-7-1
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-7) * (-0.01020408163265306) &
-                 + in(i+0,j-6) * (-0.011904761904761904) &
-                 + in(i+0,j-5) * (-0.014285714285714285) &
-                 + in(i+0,j-4) * (-0.017857142857142856) &
-                 + in(i+0,j-3) * (-0.023809523809523808) &
-                 + in(i+0,j-2) * (-0.03571428571428571) &
-                 + in(i+0,j-1) * (-0.07142857142857142) &
-                 + in(i-7,j+0) * (-0.01020408163265306) &
-                 + in(i-6,j+0) * (-0.011904761904761904) &
-                 + in(i-5,j+0) * (-0.014285714285714285) &
-                 + in(i-4,j+0) * (-0.017857142857142856) &
-                 + in(i-3,j+0) * (-0.023809523809523808) &
-                 + in(i-2,j+0) * (-0.03571428571428571) &
-                 + in(i-1,j+0) * (-0.07142857142857142) &
-                 + in(i+1,j+0) * (0.07142857142857142) &
-                 + in(i+2,j+0) * (0.03571428571428571) &
-                 + in(i+3,j+0) * (0.023809523809523808) &
-                 + in(i+4,j+0) * (0.017857142857142856) &
-                 + in(i+5,j+0) * (0.014285714285714285) &
-                 + in(i+6,j+0) * (0.011904761904761904) &
-                 + in(i+7,j+0) * (0.01020408163265306) &
-                 + in(i+0,j+1) * (0.07142857142857142) &
-                 + in(i+0,j+2) * (0.03571428571428571) &
-                 + in(i+0,j+3) * (0.023809523809523808) &
-                 + in(i+0,j+4) * (0.017857142857142856) &
-                 + in(i+0,j+5) * (0.014285714285714285) &
-                 + in(i+0,j+6) * (0.011904761904761904) &
-                 + in(i+0,j+7) * (0.01020408163265306) &
+                 + in(i+0,j-7) * (-0.01020408163265306d0) &
+                 + in(i+0,j-6) * (-0.011904761904761904d0) &
+                 + in(i+0,j-5) * (-0.014285714285714285d0) &
+                 + in(i+0,j-4) * (-0.017857142857142856d0) &
+                 + in(i+0,j-3) * (-0.023809523809523808d0) &
+                 + in(i+0,j-2) * (-0.03571428571428571d0) &
+                 + in(i+0,j-1) * (-0.07142857142857142d0) &
+                 + in(i-7,j+0) * (-0.01020408163265306d0) &
+                 + in(i-6,j+0) * (-0.011904761904761904d0) &
+                 + in(i-5,j+0) * (-0.014285714285714285d0) &
+                 + in(i-4,j+0) * (-0.017857142857142856d0) &
+                 + in(i-3,j+0) * (-0.023809523809523808d0) &
+                 + in(i-2,j+0) * (-0.03571428571428571d0) &
+                 + in(i-1,j+0) * (-0.07142857142857142d0) &
+                 + in(i+1,j+0) * (0.07142857142857142d0) &
+                 + in(i+2,j+0) * (0.03571428571428571d0) &
+                 + in(i+3,j+0) * (0.023809523809523808d0) &
+                 + in(i+4,j+0) * (0.017857142857142856d0) &
+                 + in(i+5,j+0) * (0.014285714285714285d0) &
+                 + in(i+6,j+0) * (0.011904761904761904d0) &
+                 + in(i+7,j+0) * (0.01020408163265306d0) &
+                 + in(i+0,j+1) * (0.07142857142857142d0) &
+                 + in(i+0,j+2) * (0.03571428571428571d0) &
+                 + in(i+0,j+3) * (0.023809523809523808d0) &
+                 + in(i+0,j+4) * (0.017857142857142856d0) &
+                 + in(i+0,j+5) * (0.014285714285714285d0) &
+                 + in(i+0,j+6) * (0.011904761904761904d0) &
+                 + in(i+0,j+7) * (0.01020408163265306d0) &
 +0.0
       end do
     end do
@@ -225,38 +225,38 @@ subroutine star8(n, in, out)
     do i=8,n-8-1
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-8) * (-0.0078125) &
-                 + in(i+0,j-7) * (-0.008928571428571428) &
-                 + in(i+0,j-6) * (-0.010416666666666666) &
-                 + in(i+0,j-5) * (-0.0125) &
-                 + in(i+0,j-4) * (-0.015625) &
-                 + in(i+0,j-3) * (-0.020833333333333332) &
-                 + in(i+0,j-2) * (-0.03125) &
-                 + in(i+0,j-1) * (-0.0625) &
-                 + in(i-8,j+0) * (-0.0078125) &
-                 + in(i-7,j+0) * (-0.008928571428571428) &
-                 + in(i-6,j+0) * (-0.010416666666666666) &
-                 + in(i-5,j+0) * (-0.0125) &
-                 + in(i-4,j+0) * (-0.015625) &
-                 + in(i-3,j+0) * (-0.020833333333333332) &
-                 + in(i-2,j+0) * (-0.03125) &
-                 + in(i-1,j+0) * (-0.0625) &
-                 + in(i+1,j+0) * (0.0625) &
-                 + in(i+2,j+0) * (0.03125) &
-                 + in(i+3,j+0) * (0.020833333333333332) &
-                 + in(i+4,j+0) * (0.015625) &
-                 + in(i+5,j+0) * (0.0125) &
-                 + in(i+6,j+0) * (0.010416666666666666) &
-                 + in(i+7,j+0) * (0.008928571428571428) &
-                 + in(i+8,j+0) * (0.0078125) &
-                 + in(i+0,j+1) * (0.0625) &
-                 + in(i+0,j+2) * (0.03125) &
-                 + in(i+0,j+3) * (0.020833333333333332) &
-                 + in(i+0,j+4) * (0.015625) &
-                 + in(i+0,j+5) * (0.0125) &
-                 + in(i+0,j+6) * (0.010416666666666666) &
-                 + in(i+0,j+7) * (0.008928571428571428) &
-                 + in(i+0,j+8) * (0.0078125) &
+                 + in(i+0,j-8) * (-0.0078125d0) &
+                 + in(i+0,j-7) * (-0.008928571428571428d0) &
+                 + in(i+0,j-6) * (-0.010416666666666666d0) &
+                 + in(i+0,j-5) * (-0.0125d0) &
+                 + in(i+0,j-4) * (-0.015625d0) &
+                 + in(i+0,j-3) * (-0.020833333333333332d0) &
+                 + in(i+0,j-2) * (-0.03125d0) &
+                 + in(i+0,j-1) * (-0.0625d0) &
+                 + in(i-8,j+0) * (-0.0078125d0) &
+                 + in(i-7,j+0) * (-0.008928571428571428d0) &
+                 + in(i-6,j+0) * (-0.010416666666666666d0) &
+                 + in(i-5,j+0) * (-0.0125d0) &
+                 + in(i-4,j+0) * (-0.015625d0) &
+                 + in(i-3,j+0) * (-0.020833333333333332d0) &
+                 + in(i-2,j+0) * (-0.03125d0) &
+                 + in(i-1,j+0) * (-0.0625d0) &
+                 + in(i+1,j+0) * (0.0625d0) &
+                 + in(i+2,j+0) * (0.03125d0) &
+                 + in(i+3,j+0) * (0.020833333333333332d0) &
+                 + in(i+4,j+0) * (0.015625d0) &
+                 + in(i+5,j+0) * (0.0125d0) &
+                 + in(i+6,j+0) * (0.010416666666666666d0) &
+                 + in(i+7,j+0) * (0.008928571428571428d0) &
+                 + in(i+8,j+0) * (0.0078125d0) &
+                 + in(i+0,j+1) * (0.0625d0) &
+                 + in(i+0,j+2) * (0.03125d0) &
+                 + in(i+0,j+3) * (0.020833333333333332d0) &
+                 + in(i+0,j+4) * (0.015625d0) &
+                 + in(i+0,j+5) * (0.0125d0) &
+                 + in(i+0,j+6) * (0.010416666666666666d0) &
+                 + in(i+0,j+7) * (0.008928571428571428d0) &
+                 + in(i+0,j+8) * (0.0078125d0) &
 +0.0
       end do
     end do
@@ -272,42 +272,42 @@ subroutine star9(n, in, out)
     do i=9,n-9-1
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-9) * (-0.006172839506172839) &
-                 + in(i+0,j-8) * (-0.006944444444444444) &
-                 + in(i+0,j-7) * (-0.007936507936507936) &
-                 + in(i+0,j-6) * (-0.009259259259259259) &
-                 + in(i+0,j-5) * (-0.011111111111111112) &
-                 + in(i+0,j-4) * (-0.013888888888888888) &
-                 + in(i+0,j-3) * (-0.018518518518518517) &
-                 + in(i+0,j-2) * (-0.027777777777777776) &
-                 + in(i+0,j-1) * (-0.05555555555555555) &
-                 + in(i-9,j+0) * (-0.006172839506172839) &
-                 + in(i-8,j+0) * (-0.006944444444444444) &
-                 + in(i-7,j+0) * (-0.007936507936507936) &
-                 + in(i-6,j+0) * (-0.009259259259259259) &
-                 + in(i-5,j+0) * (-0.011111111111111112) &
-                 + in(i-4,j+0) * (-0.013888888888888888) &
-                 + in(i-3,j+0) * (-0.018518518518518517) &
-                 + in(i-2,j+0) * (-0.027777777777777776) &
-                 + in(i-1,j+0) * (-0.05555555555555555) &
-                 + in(i+1,j+0) * (0.05555555555555555) &
-                 + in(i+2,j+0) * (0.027777777777777776) &
-                 + in(i+3,j+0) * (0.018518518518518517) &
-                 + in(i+4,j+0) * (0.013888888888888888) &
-                 + in(i+5,j+0) * (0.011111111111111112) &
-                 + in(i+6,j+0) * (0.009259259259259259) &
-                 + in(i+7,j+0) * (0.007936507936507936) &
-                 + in(i+8,j+0) * (0.006944444444444444) &
-                 + in(i+9,j+0) * (0.006172839506172839) &
-                 + in(i+0,j+1) * (0.05555555555555555) &
-                 + in(i+0,j+2) * (0.027777777777777776) &
-                 + in(i+0,j+3) * (0.018518518518518517) &
-                 + in(i+0,j+4) * (0.013888888888888888) &
-                 + in(i+0,j+5) * (0.011111111111111112) &
-                 + in(i+0,j+6) * (0.009259259259259259) &
-                 + in(i+0,j+7) * (0.007936507936507936) &
-                 + in(i+0,j+8) * (0.006944444444444444) &
-                 + in(i+0,j+9) * (0.006172839506172839) &
+                 + in(i+0,j-9) * (-0.006172839506172839d0) &
+                 + in(i+0,j-8) * (-0.006944444444444444d0) &
+                 + in(i+0,j-7) * (-0.007936507936507936d0) &
+                 + in(i+0,j-6) * (-0.009259259259259259d0) &
+                 + in(i+0,j-5) * (-0.011111111111111112d0) &
+                 + in(i+0,j-4) * (-0.013888888888888888d0) &
+                 + in(i+0,j-3) * (-0.018518518518518517d0) &
+                 + in(i+0,j-2) * (-0.027777777777777776d0) &
+                 + in(i+0,j-1) * (-0.05555555555555555d0) &
+                 + in(i-9,j+0) * (-0.006172839506172839d0) &
+                 + in(i-8,j+0) * (-0.006944444444444444d0) &
+                 + in(i-7,j+0) * (-0.007936507936507936d0) &
+                 + in(i-6,j+0) * (-0.009259259259259259d0) &
+                 + in(i-5,j+0) * (-0.011111111111111112d0) &
+                 + in(i-4,j+0) * (-0.013888888888888888d0) &
+                 + in(i-3,j+0) * (-0.018518518518518517d0) &
+                 + in(i-2,j+0) * (-0.027777777777777776d0) &
+                 + in(i-1,j+0) * (-0.05555555555555555d0) &
+                 + in(i+1,j+0) * (0.05555555555555555d0) &
+                 + in(i+2,j+0) * (0.027777777777777776d0) &
+                 + in(i+3,j+0) * (0.018518518518518517d0) &
+                 + in(i+4,j+0) * (0.013888888888888888d0) &
+                 + in(i+5,j+0) * (0.011111111111111112d0) &
+                 + in(i+6,j+0) * (0.009259259259259259d0) &
+                 + in(i+7,j+0) * (0.007936507936507936d0) &
+                 + in(i+8,j+0) * (0.006944444444444444d0) &
+                 + in(i+9,j+0) * (0.006172839506172839d0) &
+                 + in(i+0,j+1) * (0.05555555555555555d0) &
+                 + in(i+0,j+2) * (0.027777777777777776d0) &
+                 + in(i+0,j+3) * (0.018518518518518517d0) &
+                 + in(i+0,j+4) * (0.013888888888888888d0) &
+                 + in(i+0,j+5) * (0.011111111111111112d0) &
+                 + in(i+0,j+6) * (0.009259259259259259d0) &
+                 + in(i+0,j+7) * (0.007936507936507936d0) &
+                 + in(i+0,j+8) * (0.006944444444444444d0) &
+                 + in(i+0,j+9) * (0.006172839506172839d0) &
 +0.0
       end do
     end do
@@ -323,10 +323,10 @@ subroutine grid1(n, in, out)
     do i=1,n-1-1
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i-1,j-1) * (-0.25) &
-                 + in(i+1,j-1) * (-0.25) &
-                 + in(i-1,j+1) * (-0.25) &
-                 + in(i+1,j+1) * (0.25) &
+                 + in(i-1,j-1) * (-0.25d0) &
+                 + in(i+1,j-1) * (-0.25d0) &
+                 + in(i-1,j+1) * (-0.25d0) &
+                 + in(i+1,j+1) * (0.25d0) &
 +0.0
       end do
     end do
@@ -342,20 +342,20 @@ subroutine grid2(n, in, out)
     do i=2,n-2-1
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i-2,j-2) * (-0.0625) &
-                 + in(i+1,j-2) * (-0.020833333333333332) &
-                 + in(i+2,j-2) * (-0.020833333333333332) &
-                 + in(i-1,j-1) * (-0.125) &
-                 + in(i+1,j-1) * (-0.125) &
-                 + in(i+2,j-1) * (-0.125) &
-                 + in(i-2,j+1) * (-0.020833333333333332) &
-                 + in(i-1,j+1) * (-0.125) &
-                 + in(i+1,j+1) * (0.125) &
-                 + in(i+2,j+1) * (0.020833333333333332) &
-                 + in(i-2,j+2) * (-0.020833333333333332) &
-                 + in(i-1,j+2) * (-0.125) &
-                 + in(i+1,j+2) * (0.020833333333333332) &
-                 + in(i+2,j+2) * (0.0625) &
+                 + in(i-2,j-2) * (-0.0625d0) &
+                 + in(i+1,j-2) * (-0.020833333333333332d0) &
+                 + in(i+2,j-2) * (-0.020833333333333332d0) &
+                 + in(i-1,j-1) * (-0.125d0) &
+                 + in(i+1,j-1) * (-0.125d0) &
+                 + in(i+2,j-1) * (-0.125d0) &
+                 + in(i-2,j+1) * (-0.020833333333333332d0) &
+                 + in(i-1,j+1) * (-0.125d0) &
+                 + in(i+1,j+1) * (0.125d0) &
+                 + in(i+2,j+1) * (0.020833333333333332d0) &
+                 + in(i-2,j+2) * (-0.020833333333333332d0) &
+                 + in(i-1,j+2) * (-0.125d0) &
+                 + in(i+1,j+2) * (0.020833333333333332d0) &
+                 + in(i+2,j+2) * (0.0625d0) &
 +0.0
       end do
     end do
@@ -371,36 +371,36 @@ subroutine grid3(n, in, out)
     do i=3,n-3-1
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i-3,j-3) * (-0.027777777777777776) &
-                 + in(i+1,j-3) * (-0.005555555555555556) &
-                 + in(i+2,j-3) * (-0.005555555555555556) &
-                 + in(i+3,j-3) * (-0.005555555555555556) &
-                 + in(i-2,j-2) * (-0.041666666666666664) &
-                 + in(i+1,j-2) * (-0.013888888888888888) &
-                 + in(i+2,j-2) * (-0.013888888888888888) &
-                 + in(i+3,j-2) * (-0.013888888888888888) &
-                 + in(i-1,j-1) * (-0.08333333333333333) &
-                 + in(i+1,j-1) * (-0.08333333333333333) &
-                 + in(i+2,j-1) * (-0.08333333333333333) &
-                 + in(i+3,j-1) * (-0.08333333333333333) &
-                 + in(i-3,j+1) * (-0.005555555555555556) &
-                 + in(i-2,j+1) * (-0.013888888888888888) &
-                 + in(i-1,j+1) * (-0.08333333333333333) &
-                 + in(i+1,j+1) * (0.08333333333333333) &
-                 + in(i+2,j+1) * (0.013888888888888888) &
-                 + in(i+3,j+1) * (0.005555555555555556) &
-                 + in(i-3,j+2) * (-0.005555555555555556) &
-                 + in(i-2,j+2) * (-0.013888888888888888) &
-                 + in(i-1,j+2) * (-0.08333333333333333) &
-                 + in(i+1,j+2) * (0.013888888888888888) &
-                 + in(i+2,j+2) * (0.041666666666666664) &
-                 + in(i+3,j+2) * (0.005555555555555556) &
-                 + in(i-3,j+3) * (-0.005555555555555556) &
-                 + in(i-2,j+3) * (-0.013888888888888888) &
-                 + in(i-1,j+3) * (-0.08333333333333333) &
-                 + in(i+1,j+3) * (0.005555555555555556) &
-                 + in(i+2,j+3) * (0.005555555555555556) &
-                 + in(i+3,j+3) * (0.027777777777777776) &
+                 + in(i-3,j-3) * (-0.027777777777777776d0) &
+                 + in(i+1,j-3) * (-0.005555555555555556d0) &
+                 + in(i+2,j-3) * (-0.005555555555555556d0) &
+                 + in(i+3,j-3) * (-0.005555555555555556d0) &
+                 + in(i-2,j-2) * (-0.041666666666666664d0) &
+                 + in(i+1,j-2) * (-0.013888888888888888d0) &
+                 + in(i+2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+3,j-2) * (-0.013888888888888888d0) &
+                 + in(i-1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+2,j-1) * (-0.08333333333333333d0) &
+                 + in(i+3,j-1) * (-0.08333333333333333d0) &
+                 + in(i-3,j+1) * (-0.005555555555555556d0) &
+                 + in(i-2,j+1) * (-0.013888888888888888d0) &
+                 + in(i-1,j+1) * (-0.08333333333333333d0) &
+                 + in(i+1,j+1) * (0.08333333333333333d0) &
+                 + in(i+2,j+1) * (0.013888888888888888d0) &
+                 + in(i+3,j+1) * (0.005555555555555556d0) &
+                 + in(i-3,j+2) * (-0.005555555555555556d0) &
+                 + in(i-2,j+2) * (-0.013888888888888888d0) &
+                 + in(i-1,j+2) * (-0.08333333333333333d0) &
+                 + in(i+1,j+2) * (0.013888888888888888d0) &
+                 + in(i+2,j+2) * (0.041666666666666664d0) &
+                 + in(i+3,j+2) * (0.005555555555555556d0) &
+                 + in(i-3,j+3) * (-0.005555555555555556d0) &
+                 + in(i-2,j+3) * (-0.013888888888888888d0) &
+                 + in(i-1,j+3) * (-0.08333333333333333d0) &
+                 + in(i+1,j+3) * (0.005555555555555556d0) &
+                 + in(i+2,j+3) * (0.005555555555555556d0) &
+                 + in(i+3,j+3) * (0.027777777777777776d0) &
 +0.0
       end do
     end do
@@ -416,58 +416,58 @@ subroutine grid4(n, in, out)
     do i=4,n-4-1
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i-4,j-4) * (-0.015625) &
-                 + in(i+1,j-4) * (-0.002232142857142857) &
-                 + in(i+2,j-4) * (-0.002232142857142857) &
-                 + in(i+3,j-4) * (-0.002232142857142857) &
-                 + in(i+4,j-4) * (-0.002232142857142857) &
-                 + in(i-3,j-3) * (-0.020833333333333332) &
-                 + in(i+1,j-3) * (-0.004166666666666667) &
-                 + in(i+2,j-3) * (-0.004166666666666667) &
-                 + in(i+3,j-3) * (-0.004166666666666667) &
-                 + in(i+4,j-3) * (-0.004166666666666667) &
-                 + in(i-2,j-2) * (-0.03125) &
-                 + in(i+1,j-2) * (-0.010416666666666666) &
-                 + in(i+2,j-2) * (-0.010416666666666666) &
-                 + in(i+3,j-2) * (-0.010416666666666666) &
-                 + in(i+4,j-2) * (-0.010416666666666666) &
-                 + in(i-1,j-1) * (-0.0625) &
-                 + in(i+1,j-1) * (-0.0625) &
-                 + in(i+2,j-1) * (-0.0625) &
-                 + in(i+3,j-1) * (-0.0625) &
-                 + in(i+4,j-1) * (-0.0625) &
-                 + in(i-4,j+1) * (-0.002232142857142857) &
-                 + in(i-3,j+1) * (-0.004166666666666667) &
-                 + in(i-2,j+1) * (-0.010416666666666666) &
-                 + in(i-1,j+1) * (-0.0625) &
-                 + in(i+1,j+1) * (0.0625) &
-                 + in(i+2,j+1) * (0.010416666666666666) &
-                 + in(i+3,j+1) * (0.004166666666666667) &
-                 + in(i+4,j+1) * (0.002232142857142857) &
-                 + in(i-4,j+2) * (-0.002232142857142857) &
-                 + in(i-3,j+2) * (-0.004166666666666667) &
-                 + in(i-2,j+2) * (-0.010416666666666666) &
-                 + in(i-1,j+2) * (-0.0625) &
-                 + in(i+1,j+2) * (0.010416666666666666) &
-                 + in(i+2,j+2) * (0.03125) &
-                 + in(i+3,j+2) * (0.004166666666666667) &
-                 + in(i+4,j+2) * (0.002232142857142857) &
-                 + in(i-4,j+3) * (-0.002232142857142857) &
-                 + in(i-3,j+3) * (-0.004166666666666667) &
-                 + in(i-2,j+3) * (-0.010416666666666666) &
-                 + in(i-1,j+3) * (-0.0625) &
-                 + in(i+1,j+3) * (0.004166666666666667) &
-                 + in(i+2,j+3) * (0.004166666666666667) &
-                 + in(i+3,j+3) * (0.020833333333333332) &
-                 + in(i+4,j+3) * (0.002232142857142857) &
-                 + in(i-4,j+4) * (-0.002232142857142857) &
-                 + in(i-3,j+4) * (-0.004166666666666667) &
-                 + in(i-2,j+4) * (-0.010416666666666666) &
-                 + in(i-1,j+4) * (-0.0625) &
-                 + in(i+1,j+4) * (0.002232142857142857) &
-                 + in(i+2,j+4) * (0.002232142857142857) &
-                 + in(i+3,j+4) * (0.002232142857142857) &
-                 + in(i+4,j+4) * (0.015625) &
+                 + in(i-4,j-4) * (-0.015625d0) &
+                 + in(i+1,j-4) * (-0.002232142857142857d0) &
+                 + in(i+2,j-4) * (-0.002232142857142857d0) &
+                 + in(i+3,j-4) * (-0.002232142857142857d0) &
+                 + in(i+4,j-4) * (-0.002232142857142857d0) &
+                 + in(i-3,j-3) * (-0.020833333333333332d0) &
+                 + in(i+1,j-3) * (-0.004166666666666667d0) &
+                 + in(i+2,j-3) * (-0.004166666666666667d0) &
+                 + in(i+3,j-3) * (-0.004166666666666667d0) &
+                 + in(i+4,j-3) * (-0.004166666666666667d0) &
+                 + in(i-2,j-2) * (-0.03125d0) &
+                 + in(i+1,j-2) * (-0.010416666666666666d0) &
+                 + in(i+2,j-2) * (-0.010416666666666666d0) &
+                 + in(i+3,j-2) * (-0.010416666666666666d0) &
+                 + in(i+4,j-2) * (-0.010416666666666666d0) &
+                 + in(i-1,j-1) * (-0.0625d0) &
+                 + in(i+1,j-1) * (-0.0625d0) &
+                 + in(i+2,j-1) * (-0.0625d0) &
+                 + in(i+3,j-1) * (-0.0625d0) &
+                 + in(i+4,j-1) * (-0.0625d0) &
+                 + in(i-4,j+1) * (-0.002232142857142857d0) &
+                 + in(i-3,j+1) * (-0.004166666666666667d0) &
+                 + in(i-2,j+1) * (-0.010416666666666666d0) &
+                 + in(i-1,j+1) * (-0.0625d0) &
+                 + in(i+1,j+1) * (0.0625d0) &
+                 + in(i+2,j+1) * (0.010416666666666666d0) &
+                 + in(i+3,j+1) * (0.004166666666666667d0) &
+                 + in(i+4,j+1) * (0.002232142857142857d0) &
+                 + in(i-4,j+2) * (-0.002232142857142857d0) &
+                 + in(i-3,j+2) * (-0.004166666666666667d0) &
+                 + in(i-2,j+2) * (-0.010416666666666666d0) &
+                 + in(i-1,j+2) * (-0.0625d0) &
+                 + in(i+1,j+2) * (0.010416666666666666d0) &
+                 + in(i+2,j+2) * (0.03125d0) &
+                 + in(i+3,j+2) * (0.004166666666666667d0) &
+                 + in(i+4,j+2) * (0.002232142857142857d0) &
+                 + in(i-4,j+3) * (-0.002232142857142857d0) &
+                 + in(i-3,j+3) * (-0.004166666666666667d0) &
+                 + in(i-2,j+3) * (-0.010416666666666666d0) &
+                 + in(i-1,j+3) * (-0.0625d0) &
+                 + in(i+1,j+3) * (0.004166666666666667d0) &
+                 + in(i+2,j+3) * (0.004166666666666667d0) &
+                 + in(i+3,j+3) * (0.020833333333333332d0) &
+                 + in(i+4,j+3) * (0.002232142857142857d0) &
+                 + in(i-4,j+4) * (-0.002232142857142857d0) &
+                 + in(i-3,j+4) * (-0.004166666666666667d0) &
+                 + in(i-2,j+4) * (-0.010416666666666666d0) &
+                 + in(i-1,j+4) * (-0.0625d0) &
+                 + in(i+1,j+4) * (0.002232142857142857d0) &
+                 + in(i+2,j+4) * (0.002232142857142857d0) &
+                 + in(i+3,j+4) * (0.002232142857142857d0) &
+                 + in(i+4,j+4) * (0.015625d0) &
 +0.0
       end do
     end do
@@ -483,86 +483,86 @@ subroutine grid5(n, in, out)
     do i=5,n-5-1
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i-5,j-5) * (-0.01) &
-                 + in(i+1,j-5) * (-0.0011111111111111111) &
-                 + in(i+2,j-5) * (-0.0011111111111111111) &
-                 + in(i+3,j-5) * (-0.0011111111111111111) &
-                 + in(i+4,j-5) * (-0.0011111111111111111) &
-                 + in(i+5,j-5) * (-0.0011111111111111111) &
-                 + in(i-4,j-4) * (-0.0125) &
-                 + in(i+1,j-4) * (-0.0017857142857142857) &
-                 + in(i+2,j-4) * (-0.0017857142857142857) &
-                 + in(i+3,j-4) * (-0.0017857142857142857) &
-                 + in(i+4,j-4) * (-0.0017857142857142857) &
-                 + in(i+5,j-4) * (-0.0017857142857142857) &
-                 + in(i-3,j-3) * (-0.016666666666666666) &
-                 + in(i+1,j-3) * (-0.0033333333333333335) &
-                 + in(i+2,j-3) * (-0.0033333333333333335) &
-                 + in(i+3,j-3) * (-0.0033333333333333335) &
-                 + in(i+4,j-3) * (-0.0033333333333333335) &
-                 + in(i+5,j-3) * (-0.0033333333333333335) &
-                 + in(i-2,j-2) * (-0.025) &
-                 + in(i+1,j-2) * (-0.008333333333333333) &
-                 + in(i+2,j-2) * (-0.008333333333333333) &
-                 + in(i+3,j-2) * (-0.008333333333333333) &
-                 + in(i+4,j-2) * (-0.008333333333333333) &
-                 + in(i+5,j-2) * (-0.008333333333333333) &
-                 + in(i-1,j-1) * (-0.05) &
-                 + in(i+1,j-1) * (-0.05) &
-                 + in(i+2,j-1) * (-0.05) &
-                 + in(i+3,j-1) * (-0.05) &
-                 + in(i+4,j-1) * (-0.05) &
-                 + in(i+5,j-1) * (-0.05) &
-                 + in(i-5,j+1) * (-0.0011111111111111111) &
-                 + in(i-4,j+1) * (-0.0017857142857142857) &
-                 + in(i-3,j+1) * (-0.0033333333333333335) &
-                 + in(i-2,j+1) * (-0.008333333333333333) &
-                 + in(i-1,j+1) * (-0.05) &
-                 + in(i+1,j+1) * (0.05) &
-                 + in(i+2,j+1) * (0.008333333333333333) &
-                 + in(i+3,j+1) * (0.0033333333333333335) &
-                 + in(i+4,j+1) * (0.0017857142857142857) &
-                 + in(i+5,j+1) * (0.0011111111111111111) &
-                 + in(i-5,j+2) * (-0.0011111111111111111) &
-                 + in(i-4,j+2) * (-0.0017857142857142857) &
-                 + in(i-3,j+2) * (-0.0033333333333333335) &
-                 + in(i-2,j+2) * (-0.008333333333333333) &
-                 + in(i-1,j+2) * (-0.05) &
-                 + in(i+1,j+2) * (0.008333333333333333) &
-                 + in(i+2,j+2) * (0.025) &
-                 + in(i+3,j+2) * (0.0033333333333333335) &
-                 + in(i+4,j+2) * (0.0017857142857142857) &
-                 + in(i+5,j+2) * (0.0011111111111111111) &
-                 + in(i-5,j+3) * (-0.0011111111111111111) &
-                 + in(i-4,j+3) * (-0.0017857142857142857) &
-                 + in(i-3,j+3) * (-0.0033333333333333335) &
-                 + in(i-2,j+3) * (-0.008333333333333333) &
-                 + in(i-1,j+3) * (-0.05) &
-                 + in(i+1,j+3) * (0.0033333333333333335) &
-                 + in(i+2,j+3) * (0.0033333333333333335) &
-                 + in(i+3,j+3) * (0.016666666666666666) &
-                 + in(i+4,j+3) * (0.0017857142857142857) &
-                 + in(i+5,j+3) * (0.0011111111111111111) &
-                 + in(i-5,j+4) * (-0.0011111111111111111) &
-                 + in(i-4,j+4) * (-0.0017857142857142857) &
-                 + in(i-3,j+4) * (-0.0033333333333333335) &
-                 + in(i-2,j+4) * (-0.008333333333333333) &
-                 + in(i-1,j+4) * (-0.05) &
-                 + in(i+1,j+4) * (0.0017857142857142857) &
-                 + in(i+2,j+4) * (0.0017857142857142857) &
-                 + in(i+3,j+4) * (0.0017857142857142857) &
-                 + in(i+4,j+4) * (0.0125) &
-                 + in(i+5,j+4) * (0.0011111111111111111) &
-                 + in(i-5,j+5) * (-0.0011111111111111111) &
-                 + in(i-4,j+5) * (-0.0017857142857142857) &
-                 + in(i-3,j+5) * (-0.0033333333333333335) &
-                 + in(i-2,j+5) * (-0.008333333333333333) &
-                 + in(i-1,j+5) * (-0.05) &
-                 + in(i+1,j+5) * (0.0011111111111111111) &
-                 + in(i+2,j+5) * (0.0011111111111111111) &
-                 + in(i+3,j+5) * (0.0011111111111111111) &
-                 + in(i+4,j+5) * (0.0011111111111111111) &
-                 + in(i+5,j+5) * (0.01) &
+                 + in(i-5,j-5) * (-0.01d0) &
+                 + in(i+1,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+2,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+3,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+4,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+5,j-5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j-4) * (-0.0125d0) &
+                 + in(i+1,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+2,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+3,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+4,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+5,j-4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j-3) * (-0.016666666666666666d0) &
+                 + in(i+1,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+2,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+3,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+4,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+5,j-3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j-2) * (-0.025d0) &
+                 + in(i+1,j-2) * (-0.008333333333333333d0) &
+                 + in(i+2,j-2) * (-0.008333333333333333d0) &
+                 + in(i+3,j-2) * (-0.008333333333333333d0) &
+                 + in(i+4,j-2) * (-0.008333333333333333d0) &
+                 + in(i+5,j-2) * (-0.008333333333333333d0) &
+                 + in(i-1,j-1) * (-0.05d0) &
+                 + in(i+1,j-1) * (-0.05d0) &
+                 + in(i+2,j-1) * (-0.05d0) &
+                 + in(i+3,j-1) * (-0.05d0) &
+                 + in(i+4,j-1) * (-0.05d0) &
+                 + in(i+5,j-1) * (-0.05d0) &
+                 + in(i-5,j+1) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+1) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+1) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+1) * (-0.008333333333333333d0) &
+                 + in(i-1,j+1) * (-0.05d0) &
+                 + in(i+1,j+1) * (0.05d0) &
+                 + in(i+2,j+1) * (0.008333333333333333d0) &
+                 + in(i+3,j+1) * (0.0033333333333333335d0) &
+                 + in(i+4,j+1) * (0.0017857142857142857d0) &
+                 + in(i+5,j+1) * (0.0011111111111111111d0) &
+                 + in(i-5,j+2) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+2) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+2) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+2) * (-0.008333333333333333d0) &
+                 + in(i-1,j+2) * (-0.05d0) &
+                 + in(i+1,j+2) * (0.008333333333333333d0) &
+                 + in(i+2,j+2) * (0.025d0) &
+                 + in(i+3,j+2) * (0.0033333333333333335d0) &
+                 + in(i+4,j+2) * (0.0017857142857142857d0) &
+                 + in(i+5,j+2) * (0.0011111111111111111d0) &
+                 + in(i-5,j+3) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+3) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+3) * (-0.008333333333333333d0) &
+                 + in(i-1,j+3) * (-0.05d0) &
+                 + in(i+1,j+3) * (0.0033333333333333335d0) &
+                 + in(i+2,j+3) * (0.0033333333333333335d0) &
+                 + in(i+3,j+3) * (0.016666666666666666d0) &
+                 + in(i+4,j+3) * (0.0017857142857142857d0) &
+                 + in(i+5,j+3) * (0.0011111111111111111d0) &
+                 + in(i-5,j+4) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+4) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+4) * (-0.008333333333333333d0) &
+                 + in(i-1,j+4) * (-0.05d0) &
+                 + in(i+1,j+4) * (0.0017857142857142857d0) &
+                 + in(i+2,j+4) * (0.0017857142857142857d0) &
+                 + in(i+3,j+4) * (0.0017857142857142857d0) &
+                 + in(i+4,j+4) * (0.0125d0) &
+                 + in(i+5,j+4) * (0.0011111111111111111d0) &
+                 + in(i-5,j+5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+5) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+5) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+5) * (-0.008333333333333333d0) &
+                 + in(i-1,j+5) * (-0.05d0) &
+                 + in(i+1,j+5) * (0.0011111111111111111d0) &
+                 + in(i+2,j+5) * (0.0011111111111111111d0) &
+                 + in(i+3,j+5) * (0.0011111111111111111d0) &
+                 + in(i+4,j+5) * (0.0011111111111111111d0) &
+                 + in(i+5,j+5) * (0.01d0) &
 +0.0
       end do
     end do
@@ -578,120 +578,120 @@ subroutine grid6(n, in, out)
     do i=6,n-6-1
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i-6,j-6) * (-0.006944444444444444) &
-                 + in(i+1,j-6) * (-0.0006313131313131314) &
-                 + in(i+2,j-6) * (-0.0006313131313131314) &
-                 + in(i+3,j-6) * (-0.0006313131313131314) &
-                 + in(i+4,j-6) * (-0.0006313131313131314) &
-                 + in(i+5,j-6) * (-0.0006313131313131314) &
-                 + in(i+6,j-6) * (-0.0006313131313131314) &
-                 + in(i-5,j-5) * (-0.008333333333333333) &
-                 + in(i+1,j-5) * (-0.000925925925925926) &
-                 + in(i+2,j-5) * (-0.000925925925925926) &
-                 + in(i+3,j-5) * (-0.000925925925925926) &
-                 + in(i+4,j-5) * (-0.000925925925925926) &
-                 + in(i+5,j-5) * (-0.000925925925925926) &
-                 + in(i+6,j-5) * (-0.000925925925925926) &
-                 + in(i-4,j-4) * (-0.010416666666666666) &
-                 + in(i+1,j-4) * (-0.001488095238095238) &
-                 + in(i+2,j-4) * (-0.001488095238095238) &
-                 + in(i+3,j-4) * (-0.001488095238095238) &
-                 + in(i+4,j-4) * (-0.001488095238095238) &
-                 + in(i+5,j-4) * (-0.001488095238095238) &
-                 + in(i+6,j-4) * (-0.001488095238095238) &
-                 + in(i-3,j-3) * (-0.013888888888888888) &
-                 + in(i+1,j-3) * (-0.002777777777777778) &
-                 + in(i+2,j-3) * (-0.002777777777777778) &
-                 + in(i+3,j-3) * (-0.002777777777777778) &
-                 + in(i+4,j-3) * (-0.002777777777777778) &
-                 + in(i+5,j-3) * (-0.002777777777777778) &
-                 + in(i+6,j-3) * (-0.002777777777777778) &
-                 + in(i-2,j-2) * (-0.020833333333333332) &
-                 + in(i+1,j-2) * (-0.006944444444444444) &
-                 + in(i+2,j-2) * (-0.006944444444444444) &
-                 + in(i+3,j-2) * (-0.006944444444444444) &
-                 + in(i+4,j-2) * (-0.006944444444444444) &
-                 + in(i+5,j-2) * (-0.006944444444444444) &
-                 + in(i+6,j-2) * (-0.006944444444444444) &
-                 + in(i-1,j-1) * (-0.041666666666666664) &
-                 + in(i+1,j-1) * (-0.041666666666666664) &
-                 + in(i+2,j-1) * (-0.041666666666666664) &
-                 + in(i+3,j-1) * (-0.041666666666666664) &
-                 + in(i+4,j-1) * (-0.041666666666666664) &
-                 + in(i+5,j-1) * (-0.041666666666666664) &
-                 + in(i+6,j-1) * (-0.041666666666666664) &
-                 + in(i-6,j+1) * (-0.0006313131313131314) &
-                 + in(i-5,j+1) * (-0.000925925925925926) &
-                 + in(i-4,j+1) * (-0.001488095238095238) &
-                 + in(i-3,j+1) * (-0.002777777777777778) &
-                 + in(i-2,j+1) * (-0.006944444444444444) &
-                 + in(i-1,j+1) * (-0.041666666666666664) &
-                 + in(i+1,j+1) * (0.041666666666666664) &
-                 + in(i+2,j+1) * (0.006944444444444444) &
-                 + in(i+3,j+1) * (0.002777777777777778) &
-                 + in(i+4,j+1) * (0.001488095238095238) &
-                 + in(i+5,j+1) * (0.000925925925925926) &
-                 + in(i+6,j+1) * (0.0006313131313131314) &
-                 + in(i-6,j+2) * (-0.0006313131313131314) &
-                 + in(i-5,j+2) * (-0.000925925925925926) &
-                 + in(i-4,j+2) * (-0.001488095238095238) &
-                 + in(i-3,j+2) * (-0.002777777777777778) &
-                 + in(i-2,j+2) * (-0.006944444444444444) &
-                 + in(i-1,j+2) * (-0.041666666666666664) &
-                 + in(i+1,j+2) * (0.006944444444444444) &
-                 + in(i+2,j+2) * (0.020833333333333332) &
-                 + in(i+3,j+2) * (0.002777777777777778) &
-                 + in(i+4,j+2) * (0.001488095238095238) &
-                 + in(i+5,j+2) * (0.000925925925925926) &
-                 + in(i+6,j+2) * (0.0006313131313131314) &
-                 + in(i-6,j+3) * (-0.0006313131313131314) &
-                 + in(i-5,j+3) * (-0.000925925925925926) &
-                 + in(i-4,j+3) * (-0.001488095238095238) &
-                 + in(i-3,j+3) * (-0.002777777777777778) &
-                 + in(i-2,j+3) * (-0.006944444444444444) &
-                 + in(i-1,j+3) * (-0.041666666666666664) &
-                 + in(i+1,j+3) * (0.002777777777777778) &
-                 + in(i+2,j+3) * (0.002777777777777778) &
-                 + in(i+3,j+3) * (0.013888888888888888) &
-                 + in(i+4,j+3) * (0.001488095238095238) &
-                 + in(i+5,j+3) * (0.000925925925925926) &
-                 + in(i+6,j+3) * (0.0006313131313131314) &
-                 + in(i-6,j+4) * (-0.0006313131313131314) &
-                 + in(i-5,j+4) * (-0.000925925925925926) &
-                 + in(i-4,j+4) * (-0.001488095238095238) &
-                 + in(i-3,j+4) * (-0.002777777777777778) &
-                 + in(i-2,j+4) * (-0.006944444444444444) &
-                 + in(i-1,j+4) * (-0.041666666666666664) &
-                 + in(i+1,j+4) * (0.001488095238095238) &
-                 + in(i+2,j+4) * (0.001488095238095238) &
-                 + in(i+3,j+4) * (0.001488095238095238) &
-                 + in(i+4,j+4) * (0.010416666666666666) &
-                 + in(i+5,j+4) * (0.000925925925925926) &
-                 + in(i+6,j+4) * (0.0006313131313131314) &
-                 + in(i-6,j+5) * (-0.0006313131313131314) &
-                 + in(i-5,j+5) * (-0.000925925925925926) &
-                 + in(i-4,j+5) * (-0.001488095238095238) &
-                 + in(i-3,j+5) * (-0.002777777777777778) &
-                 + in(i-2,j+5) * (-0.006944444444444444) &
-                 + in(i-1,j+5) * (-0.041666666666666664) &
-                 + in(i+1,j+5) * (0.000925925925925926) &
-                 + in(i+2,j+5) * (0.000925925925925926) &
-                 + in(i+3,j+5) * (0.000925925925925926) &
-                 + in(i+4,j+5) * (0.000925925925925926) &
-                 + in(i+5,j+5) * (0.008333333333333333) &
-                 + in(i+6,j+5) * (0.0006313131313131314) &
-                 + in(i-6,j+6) * (-0.0006313131313131314) &
-                 + in(i-5,j+6) * (-0.000925925925925926) &
-                 + in(i-4,j+6) * (-0.001488095238095238) &
-                 + in(i-3,j+6) * (-0.002777777777777778) &
-                 + in(i-2,j+6) * (-0.006944444444444444) &
-                 + in(i-1,j+6) * (-0.041666666666666664) &
-                 + in(i+1,j+6) * (0.0006313131313131314) &
-                 + in(i+2,j+6) * (0.0006313131313131314) &
-                 + in(i+3,j+6) * (0.0006313131313131314) &
-                 + in(i+4,j+6) * (0.0006313131313131314) &
-                 + in(i+5,j+6) * (0.0006313131313131314) &
-                 + in(i+6,j+6) * (0.006944444444444444) &
+                 + in(i-6,j-6) * (-0.006944444444444444d0) &
+                 + in(i+1,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+2,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+3,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+4,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+5,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+6,j-6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j-5) * (-0.008333333333333333d0) &
+                 + in(i+1,j-5) * (-0.000925925925925926d0) &
+                 + in(i+2,j-5) * (-0.000925925925925926d0) &
+                 + in(i+3,j-5) * (-0.000925925925925926d0) &
+                 + in(i+4,j-5) * (-0.000925925925925926d0) &
+                 + in(i+5,j-5) * (-0.000925925925925926d0) &
+                 + in(i+6,j-5) * (-0.000925925925925926d0) &
+                 + in(i-4,j-4) * (-0.010416666666666666d0) &
+                 + in(i+1,j-4) * (-0.001488095238095238d0) &
+                 + in(i+2,j-4) * (-0.001488095238095238d0) &
+                 + in(i+3,j-4) * (-0.001488095238095238d0) &
+                 + in(i+4,j-4) * (-0.001488095238095238d0) &
+                 + in(i+5,j-4) * (-0.001488095238095238d0) &
+                 + in(i+6,j-4) * (-0.001488095238095238d0) &
+                 + in(i-3,j-3) * (-0.013888888888888888d0) &
+                 + in(i+1,j-3) * (-0.002777777777777778d0) &
+                 + in(i+2,j-3) * (-0.002777777777777778d0) &
+                 + in(i+3,j-3) * (-0.002777777777777778d0) &
+                 + in(i+4,j-3) * (-0.002777777777777778d0) &
+                 + in(i+5,j-3) * (-0.002777777777777778d0) &
+                 + in(i+6,j-3) * (-0.002777777777777778d0) &
+                 + in(i-2,j-2) * (-0.020833333333333332d0) &
+                 + in(i+1,j-2) * (-0.006944444444444444d0) &
+                 + in(i+2,j-2) * (-0.006944444444444444d0) &
+                 + in(i+3,j-2) * (-0.006944444444444444d0) &
+                 + in(i+4,j-2) * (-0.006944444444444444d0) &
+                 + in(i+5,j-2) * (-0.006944444444444444d0) &
+                 + in(i+6,j-2) * (-0.006944444444444444d0) &
+                 + in(i-1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+2,j-1) * (-0.041666666666666664d0) &
+                 + in(i+3,j-1) * (-0.041666666666666664d0) &
+                 + in(i+4,j-1) * (-0.041666666666666664d0) &
+                 + in(i+5,j-1) * (-0.041666666666666664d0) &
+                 + in(i+6,j-1) * (-0.041666666666666664d0) &
+                 + in(i-6,j+1) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+1) * (-0.000925925925925926d0) &
+                 + in(i-4,j+1) * (-0.001488095238095238d0) &
+                 + in(i-3,j+1) * (-0.002777777777777778d0) &
+                 + in(i-2,j+1) * (-0.006944444444444444d0) &
+                 + in(i-1,j+1) * (-0.041666666666666664d0) &
+                 + in(i+1,j+1) * (0.041666666666666664d0) &
+                 + in(i+2,j+1) * (0.006944444444444444d0) &
+                 + in(i+3,j+1) * (0.002777777777777778d0) &
+                 + in(i+4,j+1) * (0.001488095238095238d0) &
+                 + in(i+5,j+1) * (0.000925925925925926d0) &
+                 + in(i+6,j+1) * (0.0006313131313131314d0) &
+                 + in(i-6,j+2) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+2) * (-0.000925925925925926d0) &
+                 + in(i-4,j+2) * (-0.001488095238095238d0) &
+                 + in(i-3,j+2) * (-0.002777777777777778d0) &
+                 + in(i-2,j+2) * (-0.006944444444444444d0) &
+                 + in(i-1,j+2) * (-0.041666666666666664d0) &
+                 + in(i+1,j+2) * (0.006944444444444444d0) &
+                 + in(i+2,j+2) * (0.020833333333333332d0) &
+                 + in(i+3,j+2) * (0.002777777777777778d0) &
+                 + in(i+4,j+2) * (0.001488095238095238d0) &
+                 + in(i+5,j+2) * (0.000925925925925926d0) &
+                 + in(i+6,j+2) * (0.0006313131313131314d0) &
+                 + in(i-6,j+3) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+3) * (-0.000925925925925926d0) &
+                 + in(i-4,j+3) * (-0.001488095238095238d0) &
+                 + in(i-3,j+3) * (-0.002777777777777778d0) &
+                 + in(i-2,j+3) * (-0.006944444444444444d0) &
+                 + in(i-1,j+3) * (-0.041666666666666664d0) &
+                 + in(i+1,j+3) * (0.002777777777777778d0) &
+                 + in(i+2,j+3) * (0.002777777777777778d0) &
+                 + in(i+3,j+3) * (0.013888888888888888d0) &
+                 + in(i+4,j+3) * (0.001488095238095238d0) &
+                 + in(i+5,j+3) * (0.000925925925925926d0) &
+                 + in(i+6,j+3) * (0.0006313131313131314d0) &
+                 + in(i-6,j+4) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+4) * (-0.000925925925925926d0) &
+                 + in(i-4,j+4) * (-0.001488095238095238d0) &
+                 + in(i-3,j+4) * (-0.002777777777777778d0) &
+                 + in(i-2,j+4) * (-0.006944444444444444d0) &
+                 + in(i-1,j+4) * (-0.041666666666666664d0) &
+                 + in(i+1,j+4) * (0.001488095238095238d0) &
+                 + in(i+2,j+4) * (0.001488095238095238d0) &
+                 + in(i+3,j+4) * (0.001488095238095238d0) &
+                 + in(i+4,j+4) * (0.010416666666666666d0) &
+                 + in(i+5,j+4) * (0.000925925925925926d0) &
+                 + in(i+6,j+4) * (0.0006313131313131314d0) &
+                 + in(i-6,j+5) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+5) * (-0.000925925925925926d0) &
+                 + in(i-4,j+5) * (-0.001488095238095238d0) &
+                 + in(i-3,j+5) * (-0.002777777777777778d0) &
+                 + in(i-2,j+5) * (-0.006944444444444444d0) &
+                 + in(i-1,j+5) * (-0.041666666666666664d0) &
+                 + in(i+1,j+5) * (0.000925925925925926d0) &
+                 + in(i+2,j+5) * (0.000925925925925926d0) &
+                 + in(i+3,j+5) * (0.000925925925925926d0) &
+                 + in(i+4,j+5) * (0.000925925925925926d0) &
+                 + in(i+5,j+5) * (0.008333333333333333d0) &
+                 + in(i+6,j+5) * (0.0006313131313131314d0) &
+                 + in(i-6,j+6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+6) * (-0.000925925925925926d0) &
+                 + in(i-4,j+6) * (-0.001488095238095238d0) &
+                 + in(i-3,j+6) * (-0.002777777777777778d0) &
+                 + in(i-2,j+6) * (-0.006944444444444444d0) &
+                 + in(i-1,j+6) * (-0.041666666666666664d0) &
+                 + in(i+1,j+6) * (0.0006313131313131314d0) &
+                 + in(i+2,j+6) * (0.0006313131313131314d0) &
+                 + in(i+3,j+6) * (0.0006313131313131314d0) &
+                 + in(i+4,j+6) * (0.0006313131313131314d0) &
+                 + in(i+5,j+6) * (0.0006313131313131314d0) &
+                 + in(i+6,j+6) * (0.006944444444444444d0) &
 +0.0
       end do
     end do
@@ -707,160 +707,160 @@ subroutine grid7(n, in, out)
     do i=7,n-7-1
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i-7,j-7) * (-0.00510204081632653) &
-                 + in(i+1,j-7) * (-0.0003924646781789639) &
-                 + in(i+2,j-7) * (-0.0003924646781789639) &
-                 + in(i+3,j-7) * (-0.0003924646781789639) &
-                 + in(i+4,j-7) * (-0.0003924646781789639) &
-                 + in(i+5,j-7) * (-0.0003924646781789639) &
-                 + in(i+6,j-7) * (-0.0003924646781789639) &
-                 + in(i+7,j-7) * (-0.0003924646781789639) &
-                 + in(i-6,j-6) * (-0.005952380952380952) &
-                 + in(i+1,j-6) * (-0.0005411255411255411) &
-                 + in(i+2,j-6) * (-0.0005411255411255411) &
-                 + in(i+3,j-6) * (-0.0005411255411255411) &
-                 + in(i+4,j-6) * (-0.0005411255411255411) &
-                 + in(i+5,j-6) * (-0.0005411255411255411) &
-                 + in(i+6,j-6) * (-0.0005411255411255411) &
-                 + in(i+7,j-6) * (-0.0005411255411255411) &
-                 + in(i-5,j-5) * (-0.007142857142857143) &
-                 + in(i+1,j-5) * (-0.0007936507936507937) &
-                 + in(i+2,j-5) * (-0.0007936507936507937) &
-                 + in(i+3,j-5) * (-0.0007936507936507937) &
-                 + in(i+4,j-5) * (-0.0007936507936507937) &
-                 + in(i+5,j-5) * (-0.0007936507936507937) &
-                 + in(i+6,j-5) * (-0.0007936507936507937) &
-                 + in(i+7,j-5) * (-0.0007936507936507937) &
-                 + in(i-4,j-4) * (-0.008928571428571428) &
-                 + in(i+1,j-4) * (-0.0012755102040816326) &
-                 + in(i+2,j-4) * (-0.0012755102040816326) &
-                 + in(i+3,j-4) * (-0.0012755102040816326) &
-                 + in(i+4,j-4) * (-0.0012755102040816326) &
-                 + in(i+5,j-4) * (-0.0012755102040816326) &
-                 + in(i+6,j-4) * (-0.0012755102040816326) &
-                 + in(i+7,j-4) * (-0.0012755102040816326) &
-                 + in(i-3,j-3) * (-0.011904761904761904) &
-                 + in(i+1,j-3) * (-0.002380952380952381) &
-                 + in(i+2,j-3) * (-0.002380952380952381) &
-                 + in(i+3,j-3) * (-0.002380952380952381) &
-                 + in(i+4,j-3) * (-0.002380952380952381) &
-                 + in(i+5,j-3) * (-0.002380952380952381) &
-                 + in(i+6,j-3) * (-0.002380952380952381) &
-                 + in(i+7,j-3) * (-0.002380952380952381) &
-                 + in(i-2,j-2) * (-0.017857142857142856) &
-                 + in(i+1,j-2) * (-0.005952380952380952) &
-                 + in(i+2,j-2) * (-0.005952380952380952) &
-                 + in(i+3,j-2) * (-0.005952380952380952) &
-                 + in(i+4,j-2) * (-0.005952380952380952) &
-                 + in(i+5,j-2) * (-0.005952380952380952) &
-                 + in(i+6,j-2) * (-0.005952380952380952) &
-                 + in(i+7,j-2) * (-0.005952380952380952) &
-                 + in(i-1,j-1) * (-0.03571428571428571) &
-                 + in(i+1,j-1) * (-0.03571428571428571) &
-                 + in(i+2,j-1) * (-0.03571428571428571) &
-                 + in(i+3,j-1) * (-0.03571428571428571) &
-                 + in(i+4,j-1) * (-0.03571428571428571) &
-                 + in(i+5,j-1) * (-0.03571428571428571) &
-                 + in(i+6,j-1) * (-0.03571428571428571) &
-                 + in(i+7,j-1) * (-0.03571428571428571) &
-                 + in(i-7,j+1) * (-0.0003924646781789639) &
-                 + in(i-6,j+1) * (-0.0005411255411255411) &
-                 + in(i-5,j+1) * (-0.0007936507936507937) &
-                 + in(i-4,j+1) * (-0.0012755102040816326) &
-                 + in(i-3,j+1) * (-0.002380952380952381) &
-                 + in(i-2,j+1) * (-0.005952380952380952) &
-                 + in(i-1,j+1) * (-0.03571428571428571) &
-                 + in(i+1,j+1) * (0.03571428571428571) &
-                 + in(i+2,j+1) * (0.005952380952380952) &
-                 + in(i+3,j+1) * (0.002380952380952381) &
-                 + in(i+4,j+1) * (0.0012755102040816326) &
-                 + in(i+5,j+1) * (0.0007936507936507937) &
-                 + in(i+6,j+1) * (0.0005411255411255411) &
-                 + in(i+7,j+1) * (0.0003924646781789639) &
-                 + in(i-7,j+2) * (-0.0003924646781789639) &
-                 + in(i-6,j+2) * (-0.0005411255411255411) &
-                 + in(i-5,j+2) * (-0.0007936507936507937) &
-                 + in(i-4,j+2) * (-0.0012755102040816326) &
-                 + in(i-3,j+2) * (-0.002380952380952381) &
-                 + in(i-2,j+2) * (-0.005952380952380952) &
-                 + in(i-1,j+2) * (-0.03571428571428571) &
-                 + in(i+1,j+2) * (0.005952380952380952) &
-                 + in(i+2,j+2) * (0.017857142857142856) &
-                 + in(i+3,j+2) * (0.002380952380952381) &
-                 + in(i+4,j+2) * (0.0012755102040816326) &
-                 + in(i+5,j+2) * (0.0007936507936507937) &
-                 + in(i+6,j+2) * (0.0005411255411255411) &
-                 + in(i+7,j+2) * (0.0003924646781789639) &
-                 + in(i-7,j+3) * (-0.0003924646781789639) &
-                 + in(i-6,j+3) * (-0.0005411255411255411) &
-                 + in(i-5,j+3) * (-0.0007936507936507937) &
-                 + in(i-4,j+3) * (-0.0012755102040816326) &
-                 + in(i-3,j+3) * (-0.002380952380952381) &
-                 + in(i-2,j+3) * (-0.005952380952380952) &
-                 + in(i-1,j+3) * (-0.03571428571428571) &
-                 + in(i+1,j+3) * (0.002380952380952381) &
-                 + in(i+2,j+3) * (0.002380952380952381) &
-                 + in(i+3,j+3) * (0.011904761904761904) &
-                 + in(i+4,j+3) * (0.0012755102040816326) &
-                 + in(i+5,j+3) * (0.0007936507936507937) &
-                 + in(i+6,j+3) * (0.0005411255411255411) &
-                 + in(i+7,j+3) * (0.0003924646781789639) &
-                 + in(i-7,j+4) * (-0.0003924646781789639) &
-                 + in(i-6,j+4) * (-0.0005411255411255411) &
-                 + in(i-5,j+4) * (-0.0007936507936507937) &
-                 + in(i-4,j+4) * (-0.0012755102040816326) &
-                 + in(i-3,j+4) * (-0.002380952380952381) &
-                 + in(i-2,j+4) * (-0.005952380952380952) &
-                 + in(i-1,j+4) * (-0.03571428571428571) &
-                 + in(i+1,j+4) * (0.0012755102040816326) &
-                 + in(i+2,j+4) * (0.0012755102040816326) &
-                 + in(i+3,j+4) * (0.0012755102040816326) &
-                 + in(i+4,j+4) * (0.008928571428571428) &
-                 + in(i+5,j+4) * (0.0007936507936507937) &
-                 + in(i+6,j+4) * (0.0005411255411255411) &
-                 + in(i+7,j+4) * (0.0003924646781789639) &
-                 + in(i-7,j+5) * (-0.0003924646781789639) &
-                 + in(i-6,j+5) * (-0.0005411255411255411) &
-                 + in(i-5,j+5) * (-0.0007936507936507937) &
-                 + in(i-4,j+5) * (-0.0012755102040816326) &
-                 + in(i-3,j+5) * (-0.002380952380952381) &
-                 + in(i-2,j+5) * (-0.005952380952380952) &
-                 + in(i-1,j+5) * (-0.03571428571428571) &
-                 + in(i+1,j+5) * (0.0007936507936507937) &
-                 + in(i+2,j+5) * (0.0007936507936507937) &
-                 + in(i+3,j+5) * (0.0007936507936507937) &
-                 + in(i+4,j+5) * (0.0007936507936507937) &
-                 + in(i+5,j+5) * (0.007142857142857143) &
-                 + in(i+6,j+5) * (0.0005411255411255411) &
-                 + in(i+7,j+5) * (0.0003924646781789639) &
-                 + in(i-7,j+6) * (-0.0003924646781789639) &
-                 + in(i-6,j+6) * (-0.0005411255411255411) &
-                 + in(i-5,j+6) * (-0.0007936507936507937) &
-                 + in(i-4,j+6) * (-0.0012755102040816326) &
-                 + in(i-3,j+6) * (-0.002380952380952381) &
-                 + in(i-2,j+6) * (-0.005952380952380952) &
-                 + in(i-1,j+6) * (-0.03571428571428571) &
-                 + in(i+1,j+6) * (0.0005411255411255411) &
-                 + in(i+2,j+6) * (0.0005411255411255411) &
-                 + in(i+3,j+6) * (0.0005411255411255411) &
-                 + in(i+4,j+6) * (0.0005411255411255411) &
-                 + in(i+5,j+6) * (0.0005411255411255411) &
-                 + in(i+6,j+6) * (0.005952380952380952) &
-                 + in(i+7,j+6) * (0.0003924646781789639) &
-                 + in(i-7,j+7) * (-0.0003924646781789639) &
-                 + in(i-6,j+7) * (-0.0005411255411255411) &
-                 + in(i-5,j+7) * (-0.0007936507936507937) &
-                 + in(i-4,j+7) * (-0.0012755102040816326) &
-                 + in(i-3,j+7) * (-0.002380952380952381) &
-                 + in(i-2,j+7) * (-0.005952380952380952) &
-                 + in(i-1,j+7) * (-0.03571428571428571) &
-                 + in(i+1,j+7) * (0.0003924646781789639) &
-                 + in(i+2,j+7) * (0.0003924646781789639) &
-                 + in(i+3,j+7) * (0.0003924646781789639) &
-                 + in(i+4,j+7) * (0.0003924646781789639) &
-                 + in(i+5,j+7) * (0.0003924646781789639) &
-                 + in(i+6,j+7) * (0.0003924646781789639) &
-                 + in(i+7,j+7) * (0.00510204081632653) &
+                 + in(i-7,j-7) * (-0.00510204081632653d0) &
+                 + in(i+1,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+2,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+3,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+4,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+5,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+6,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+7,j-7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j-6) * (-0.005952380952380952d0) &
+                 + in(i+1,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+2,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+3,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+4,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+5,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+6,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+7,j-6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j-5) * (-0.007142857142857143d0) &
+                 + in(i+1,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+2,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+3,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+4,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+5,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+6,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+7,j-5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j-4) * (-0.008928571428571428d0) &
+                 + in(i+1,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+2,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+3,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+4,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+5,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+6,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+7,j-4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j-3) * (-0.011904761904761904d0) &
+                 + in(i+1,j-3) * (-0.002380952380952381d0) &
+                 + in(i+2,j-3) * (-0.002380952380952381d0) &
+                 + in(i+3,j-3) * (-0.002380952380952381d0) &
+                 + in(i+4,j-3) * (-0.002380952380952381d0) &
+                 + in(i+5,j-3) * (-0.002380952380952381d0) &
+                 + in(i+6,j-3) * (-0.002380952380952381d0) &
+                 + in(i+7,j-3) * (-0.002380952380952381d0) &
+                 + in(i-2,j-2) * (-0.017857142857142856d0) &
+                 + in(i+1,j-2) * (-0.005952380952380952d0) &
+                 + in(i+2,j-2) * (-0.005952380952380952d0) &
+                 + in(i+3,j-2) * (-0.005952380952380952d0) &
+                 + in(i+4,j-2) * (-0.005952380952380952d0) &
+                 + in(i+5,j-2) * (-0.005952380952380952d0) &
+                 + in(i+6,j-2) * (-0.005952380952380952d0) &
+                 + in(i+7,j-2) * (-0.005952380952380952d0) &
+                 + in(i-1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+2,j-1) * (-0.03571428571428571d0) &
+                 + in(i+3,j-1) * (-0.03571428571428571d0) &
+                 + in(i+4,j-1) * (-0.03571428571428571d0) &
+                 + in(i+5,j-1) * (-0.03571428571428571d0) &
+                 + in(i+6,j-1) * (-0.03571428571428571d0) &
+                 + in(i+7,j-1) * (-0.03571428571428571d0) &
+                 + in(i-7,j+1) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+1) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+1) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+1) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+1) * (-0.002380952380952381d0) &
+                 + in(i-2,j+1) * (-0.005952380952380952d0) &
+                 + in(i-1,j+1) * (-0.03571428571428571d0) &
+                 + in(i+1,j+1) * (0.03571428571428571d0) &
+                 + in(i+2,j+1) * (0.005952380952380952d0) &
+                 + in(i+3,j+1) * (0.002380952380952381d0) &
+                 + in(i+4,j+1) * (0.0012755102040816326d0) &
+                 + in(i+5,j+1) * (0.0007936507936507937d0) &
+                 + in(i+6,j+1) * (0.0005411255411255411d0) &
+                 + in(i+7,j+1) * (0.0003924646781789639d0) &
+                 + in(i-7,j+2) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+2) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+2) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+2) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+2) * (-0.002380952380952381d0) &
+                 + in(i-2,j+2) * (-0.005952380952380952d0) &
+                 + in(i-1,j+2) * (-0.03571428571428571d0) &
+                 + in(i+1,j+2) * (0.005952380952380952d0) &
+                 + in(i+2,j+2) * (0.017857142857142856d0) &
+                 + in(i+3,j+2) * (0.002380952380952381d0) &
+                 + in(i+4,j+2) * (0.0012755102040816326d0) &
+                 + in(i+5,j+2) * (0.0007936507936507937d0) &
+                 + in(i+6,j+2) * (0.0005411255411255411d0) &
+                 + in(i+7,j+2) * (0.0003924646781789639d0) &
+                 + in(i-7,j+3) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+3) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+3) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+3) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+3) * (-0.002380952380952381d0) &
+                 + in(i-2,j+3) * (-0.005952380952380952d0) &
+                 + in(i-1,j+3) * (-0.03571428571428571d0) &
+                 + in(i+1,j+3) * (0.002380952380952381d0) &
+                 + in(i+2,j+3) * (0.002380952380952381d0) &
+                 + in(i+3,j+3) * (0.011904761904761904d0) &
+                 + in(i+4,j+3) * (0.0012755102040816326d0) &
+                 + in(i+5,j+3) * (0.0007936507936507937d0) &
+                 + in(i+6,j+3) * (0.0005411255411255411d0) &
+                 + in(i+7,j+3) * (0.0003924646781789639d0) &
+                 + in(i-7,j+4) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+4) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+4) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+4) * (-0.002380952380952381d0) &
+                 + in(i-2,j+4) * (-0.005952380952380952d0) &
+                 + in(i-1,j+4) * (-0.03571428571428571d0) &
+                 + in(i+1,j+4) * (0.0012755102040816326d0) &
+                 + in(i+2,j+4) * (0.0012755102040816326d0) &
+                 + in(i+3,j+4) * (0.0012755102040816326d0) &
+                 + in(i+4,j+4) * (0.008928571428571428d0) &
+                 + in(i+5,j+4) * (0.0007936507936507937d0) &
+                 + in(i+6,j+4) * (0.0005411255411255411d0) &
+                 + in(i+7,j+4) * (0.0003924646781789639d0) &
+                 + in(i-7,j+5) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+5) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+5) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+5) * (-0.002380952380952381d0) &
+                 + in(i-2,j+5) * (-0.005952380952380952d0) &
+                 + in(i-1,j+5) * (-0.03571428571428571d0) &
+                 + in(i+1,j+5) * (0.0007936507936507937d0) &
+                 + in(i+2,j+5) * (0.0007936507936507937d0) &
+                 + in(i+3,j+5) * (0.0007936507936507937d0) &
+                 + in(i+4,j+5) * (0.0007936507936507937d0) &
+                 + in(i+5,j+5) * (0.007142857142857143d0) &
+                 + in(i+6,j+5) * (0.0005411255411255411d0) &
+                 + in(i+7,j+5) * (0.0003924646781789639d0) &
+                 + in(i-7,j+6) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+6) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+6) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+6) * (-0.002380952380952381d0) &
+                 + in(i-2,j+6) * (-0.005952380952380952d0) &
+                 + in(i-1,j+6) * (-0.03571428571428571d0) &
+                 + in(i+1,j+6) * (0.0005411255411255411d0) &
+                 + in(i+2,j+6) * (0.0005411255411255411d0) &
+                 + in(i+3,j+6) * (0.0005411255411255411d0) &
+                 + in(i+4,j+6) * (0.0005411255411255411d0) &
+                 + in(i+5,j+6) * (0.0005411255411255411d0) &
+                 + in(i+6,j+6) * (0.005952380952380952d0) &
+                 + in(i+7,j+6) * (0.0003924646781789639d0) &
+                 + in(i-7,j+7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+7) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+7) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+7) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+7) * (-0.002380952380952381d0) &
+                 + in(i-2,j+7) * (-0.005952380952380952d0) &
+                 + in(i-1,j+7) * (-0.03571428571428571d0) &
+                 + in(i+1,j+7) * (0.0003924646781789639d0) &
+                 + in(i+2,j+7) * (0.0003924646781789639d0) &
+                 + in(i+3,j+7) * (0.0003924646781789639d0) &
+                 + in(i+4,j+7) * (0.0003924646781789639d0) &
+                 + in(i+5,j+7) * (0.0003924646781789639d0) &
+                 + in(i+6,j+7) * (0.0003924646781789639d0) &
+                 + in(i+7,j+7) * (0.00510204081632653d0) &
 +0.0
       end do
     end do
@@ -876,206 +876,206 @@ subroutine grid8(n, in, out)
     do i=8,n-8-1
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i-8,j-8) * (-0.00390625) &
-                 + in(i+1,j-8) * (-0.00026041666666666666) &
-                 + in(i+2,j-8) * (-0.00026041666666666666) &
-                 + in(i+3,j-8) * (-0.00026041666666666666) &
-                 + in(i+4,j-8) * (-0.00026041666666666666) &
-                 + in(i+5,j-8) * (-0.00026041666666666666) &
-                 + in(i+6,j-8) * (-0.00026041666666666666) &
-                 + in(i+7,j-8) * (-0.00026041666666666666) &
-                 + in(i+8,j-8) * (-0.00026041666666666666) &
-                 + in(i-7,j-7) * (-0.004464285714285714) &
-                 + in(i+1,j-7) * (-0.00034340659340659343) &
-                 + in(i+2,j-7) * (-0.00034340659340659343) &
-                 + in(i+3,j-7) * (-0.00034340659340659343) &
-                 + in(i+4,j-7) * (-0.00034340659340659343) &
-                 + in(i+5,j-7) * (-0.00034340659340659343) &
-                 + in(i+6,j-7) * (-0.00034340659340659343) &
-                 + in(i+7,j-7) * (-0.00034340659340659343) &
-                 + in(i+8,j-7) * (-0.00034340659340659343) &
-                 + in(i-6,j-6) * (-0.005208333333333333) &
-                 + in(i+1,j-6) * (-0.0004734848484848485) &
-                 + in(i+2,j-6) * (-0.0004734848484848485) &
-                 + in(i+3,j-6) * (-0.0004734848484848485) &
-                 + in(i+4,j-6) * (-0.0004734848484848485) &
-                 + in(i+5,j-6) * (-0.0004734848484848485) &
-                 + in(i+6,j-6) * (-0.0004734848484848485) &
-                 + in(i+7,j-6) * (-0.0004734848484848485) &
-                 + in(i+8,j-6) * (-0.0004734848484848485) &
-                 + in(i-5,j-5) * (-0.00625) &
-                 + in(i+1,j-5) * (-0.0006944444444444445) &
-                 + in(i+2,j-5) * (-0.0006944444444444445) &
-                 + in(i+3,j-5) * (-0.0006944444444444445) &
-                 + in(i+4,j-5) * (-0.0006944444444444445) &
-                 + in(i+5,j-5) * (-0.0006944444444444445) &
-                 + in(i+6,j-5) * (-0.0006944444444444445) &
-                 + in(i+7,j-5) * (-0.0006944444444444445) &
-                 + in(i+8,j-5) * (-0.0006944444444444445) &
-                 + in(i-4,j-4) * (-0.0078125) &
-                 + in(i+1,j-4) * (-0.0011160714285714285) &
-                 + in(i+2,j-4) * (-0.0011160714285714285) &
-                 + in(i+3,j-4) * (-0.0011160714285714285) &
-                 + in(i+4,j-4) * (-0.0011160714285714285) &
-                 + in(i+5,j-4) * (-0.0011160714285714285) &
-                 + in(i+6,j-4) * (-0.0011160714285714285) &
-                 + in(i+7,j-4) * (-0.0011160714285714285) &
-                 + in(i+8,j-4) * (-0.0011160714285714285) &
-                 + in(i-3,j-3) * (-0.010416666666666666) &
-                 + in(i+1,j-3) * (-0.0020833333333333333) &
-                 + in(i+2,j-3) * (-0.0020833333333333333) &
-                 + in(i+3,j-3) * (-0.0020833333333333333) &
-                 + in(i+4,j-3) * (-0.0020833333333333333) &
-                 + in(i+5,j-3) * (-0.0020833333333333333) &
-                 + in(i+6,j-3) * (-0.0020833333333333333) &
-                 + in(i+7,j-3) * (-0.0020833333333333333) &
-                 + in(i+8,j-3) * (-0.0020833333333333333) &
-                 + in(i-2,j-2) * (-0.015625) &
-                 + in(i+1,j-2) * (-0.005208333333333333) &
-                 + in(i+2,j-2) * (-0.005208333333333333) &
-                 + in(i+3,j-2) * (-0.005208333333333333) &
-                 + in(i+4,j-2) * (-0.005208333333333333) &
-                 + in(i+5,j-2) * (-0.005208333333333333) &
-                 + in(i+6,j-2) * (-0.005208333333333333) &
-                 + in(i+7,j-2) * (-0.005208333333333333) &
-                 + in(i+8,j-2) * (-0.005208333333333333) &
-                 + in(i-1,j-1) * (-0.03125) &
-                 + in(i+1,j-1) * (-0.03125) &
-                 + in(i+2,j-1) * (-0.03125) &
-                 + in(i+3,j-1) * (-0.03125) &
-                 + in(i+4,j-1) * (-0.03125) &
-                 + in(i+5,j-1) * (-0.03125) &
-                 + in(i+6,j-1) * (-0.03125) &
-                 + in(i+7,j-1) * (-0.03125) &
-                 + in(i+8,j-1) * (-0.03125) &
-                 + in(i-8,j+1) * (-0.00026041666666666666) &
-                 + in(i-7,j+1) * (-0.00034340659340659343) &
-                 + in(i-6,j+1) * (-0.0004734848484848485) &
-                 + in(i-5,j+1) * (-0.0006944444444444445) &
-                 + in(i-4,j+1) * (-0.0011160714285714285) &
-                 + in(i-3,j+1) * (-0.0020833333333333333) &
-                 + in(i-2,j+1) * (-0.005208333333333333) &
-                 + in(i-1,j+1) * (-0.03125) &
-                 + in(i+1,j+1) * (0.03125) &
-                 + in(i+2,j+1) * (0.005208333333333333) &
-                 + in(i+3,j+1) * (0.0020833333333333333) &
-                 + in(i+4,j+1) * (0.0011160714285714285) &
-                 + in(i+5,j+1) * (0.0006944444444444445) &
-                 + in(i+6,j+1) * (0.0004734848484848485) &
-                 + in(i+7,j+1) * (0.00034340659340659343) &
-                 + in(i+8,j+1) * (0.00026041666666666666) &
-                 + in(i-8,j+2) * (-0.00026041666666666666) &
-                 + in(i-7,j+2) * (-0.00034340659340659343) &
-                 + in(i-6,j+2) * (-0.0004734848484848485) &
-                 + in(i-5,j+2) * (-0.0006944444444444445) &
-                 + in(i-4,j+2) * (-0.0011160714285714285) &
-                 + in(i-3,j+2) * (-0.0020833333333333333) &
-                 + in(i-2,j+2) * (-0.005208333333333333) &
-                 + in(i-1,j+2) * (-0.03125) &
-                 + in(i+1,j+2) * (0.005208333333333333) &
-                 + in(i+2,j+2) * (0.015625) &
-                 + in(i+3,j+2) * (0.0020833333333333333) &
-                 + in(i+4,j+2) * (0.0011160714285714285) &
-                 + in(i+5,j+2) * (0.0006944444444444445) &
-                 + in(i+6,j+2) * (0.0004734848484848485) &
-                 + in(i+7,j+2) * (0.00034340659340659343) &
-                 + in(i+8,j+2) * (0.00026041666666666666) &
-                 + in(i-8,j+3) * (-0.00026041666666666666) &
-                 + in(i-7,j+3) * (-0.00034340659340659343) &
-                 + in(i-6,j+3) * (-0.0004734848484848485) &
-                 + in(i-5,j+3) * (-0.0006944444444444445) &
-                 + in(i-4,j+3) * (-0.0011160714285714285) &
-                 + in(i-3,j+3) * (-0.0020833333333333333) &
-                 + in(i-2,j+3) * (-0.005208333333333333) &
-                 + in(i-1,j+3) * (-0.03125) &
-                 + in(i+1,j+3) * (0.0020833333333333333) &
-                 + in(i+2,j+3) * (0.0020833333333333333) &
-                 + in(i+3,j+3) * (0.010416666666666666) &
-                 + in(i+4,j+3) * (0.0011160714285714285) &
-                 + in(i+5,j+3) * (0.0006944444444444445) &
-                 + in(i+6,j+3) * (0.0004734848484848485) &
-                 + in(i+7,j+3) * (0.00034340659340659343) &
-                 + in(i+8,j+3) * (0.00026041666666666666) &
-                 + in(i-8,j+4) * (-0.00026041666666666666) &
-                 + in(i-7,j+4) * (-0.00034340659340659343) &
-                 + in(i-6,j+4) * (-0.0004734848484848485) &
-                 + in(i-5,j+4) * (-0.0006944444444444445) &
-                 + in(i-4,j+4) * (-0.0011160714285714285) &
-                 + in(i-3,j+4) * (-0.0020833333333333333) &
-                 + in(i-2,j+4) * (-0.005208333333333333) &
-                 + in(i-1,j+4) * (-0.03125) &
-                 + in(i+1,j+4) * (0.0011160714285714285) &
-                 + in(i+2,j+4) * (0.0011160714285714285) &
-                 + in(i+3,j+4) * (0.0011160714285714285) &
-                 + in(i+4,j+4) * (0.0078125) &
-                 + in(i+5,j+4) * (0.0006944444444444445) &
-                 + in(i+6,j+4) * (0.0004734848484848485) &
-                 + in(i+7,j+4) * (0.00034340659340659343) &
-                 + in(i+8,j+4) * (0.00026041666666666666) &
-                 + in(i-8,j+5) * (-0.00026041666666666666) &
-                 + in(i-7,j+5) * (-0.00034340659340659343) &
-                 + in(i-6,j+5) * (-0.0004734848484848485) &
-                 + in(i-5,j+5) * (-0.0006944444444444445) &
-                 + in(i-4,j+5) * (-0.0011160714285714285) &
-                 + in(i-3,j+5) * (-0.0020833333333333333) &
-                 + in(i-2,j+5) * (-0.005208333333333333) &
-                 + in(i-1,j+5) * (-0.03125) &
-                 + in(i+1,j+5) * (0.0006944444444444445) &
-                 + in(i+2,j+5) * (0.0006944444444444445) &
-                 + in(i+3,j+5) * (0.0006944444444444445) &
-                 + in(i+4,j+5) * (0.0006944444444444445) &
-                 + in(i+5,j+5) * (0.00625) &
-                 + in(i+6,j+5) * (0.0004734848484848485) &
-                 + in(i+7,j+5) * (0.00034340659340659343) &
-                 + in(i+8,j+5) * (0.00026041666666666666) &
-                 + in(i-8,j+6) * (-0.00026041666666666666) &
-                 + in(i-7,j+6) * (-0.00034340659340659343) &
-                 + in(i-6,j+6) * (-0.0004734848484848485) &
-                 + in(i-5,j+6) * (-0.0006944444444444445) &
-                 + in(i-4,j+6) * (-0.0011160714285714285) &
-                 + in(i-3,j+6) * (-0.0020833333333333333) &
-                 + in(i-2,j+6) * (-0.005208333333333333) &
-                 + in(i-1,j+6) * (-0.03125) &
-                 + in(i+1,j+6) * (0.0004734848484848485) &
-                 + in(i+2,j+6) * (0.0004734848484848485) &
-                 + in(i+3,j+6) * (0.0004734848484848485) &
-                 + in(i+4,j+6) * (0.0004734848484848485) &
-                 + in(i+5,j+6) * (0.0004734848484848485) &
-                 + in(i+6,j+6) * (0.005208333333333333) &
-                 + in(i+7,j+6) * (0.00034340659340659343) &
-                 + in(i+8,j+6) * (0.00026041666666666666) &
-                 + in(i-8,j+7) * (-0.00026041666666666666) &
-                 + in(i-7,j+7) * (-0.00034340659340659343) &
-                 + in(i-6,j+7) * (-0.0004734848484848485) &
-                 + in(i-5,j+7) * (-0.0006944444444444445) &
-                 + in(i-4,j+7) * (-0.0011160714285714285) &
-                 + in(i-3,j+7) * (-0.0020833333333333333) &
-                 + in(i-2,j+7) * (-0.005208333333333333) &
-                 + in(i-1,j+7) * (-0.03125) &
-                 + in(i+1,j+7) * (0.00034340659340659343) &
-                 + in(i+2,j+7) * (0.00034340659340659343) &
-                 + in(i+3,j+7) * (0.00034340659340659343) &
-                 + in(i+4,j+7) * (0.00034340659340659343) &
-                 + in(i+5,j+7) * (0.00034340659340659343) &
-                 + in(i+6,j+7) * (0.00034340659340659343) &
-                 + in(i+7,j+7) * (0.004464285714285714) &
-                 + in(i+8,j+7) * (0.00026041666666666666) &
-                 + in(i-8,j+8) * (-0.00026041666666666666) &
-                 + in(i-7,j+8) * (-0.00034340659340659343) &
-                 + in(i-6,j+8) * (-0.0004734848484848485) &
-                 + in(i-5,j+8) * (-0.0006944444444444445) &
-                 + in(i-4,j+8) * (-0.0011160714285714285) &
-                 + in(i-3,j+8) * (-0.0020833333333333333) &
-                 + in(i-2,j+8) * (-0.005208333333333333) &
-                 + in(i-1,j+8) * (-0.03125) &
-                 + in(i+1,j+8) * (0.00026041666666666666) &
-                 + in(i+2,j+8) * (0.00026041666666666666) &
-                 + in(i+3,j+8) * (0.00026041666666666666) &
-                 + in(i+4,j+8) * (0.00026041666666666666) &
-                 + in(i+5,j+8) * (0.00026041666666666666) &
-                 + in(i+6,j+8) * (0.00026041666666666666) &
-                 + in(i+7,j+8) * (0.00026041666666666666) &
-                 + in(i+8,j+8) * (0.00390625) &
+                 + in(i-8,j-8) * (-0.00390625d0) &
+                 + in(i+1,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+2,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+3,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+4,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+5,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+6,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+7,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+8,j-8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j-7) * (-0.004464285714285714d0) &
+                 + in(i+1,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+2,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+3,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+4,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+5,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+6,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+7,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+8,j-7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j-6) * (-0.005208333333333333d0) &
+                 + in(i+1,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+2,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+3,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+4,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+5,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+6,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+7,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+8,j-6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j-5) * (-0.00625d0) &
+                 + in(i+1,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+2,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+3,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+4,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+5,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+6,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+7,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+8,j-5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j-4) * (-0.0078125d0) &
+                 + in(i+1,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+2,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+3,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+4,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+5,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+6,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+7,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+8,j-4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j-3) * (-0.010416666666666666d0) &
+                 + in(i+1,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+2,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+3,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+4,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+5,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+6,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+7,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+8,j-3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j-2) * (-0.015625d0) &
+                 + in(i+1,j-2) * (-0.005208333333333333d0) &
+                 + in(i+2,j-2) * (-0.005208333333333333d0) &
+                 + in(i+3,j-2) * (-0.005208333333333333d0) &
+                 + in(i+4,j-2) * (-0.005208333333333333d0) &
+                 + in(i+5,j-2) * (-0.005208333333333333d0) &
+                 + in(i+6,j-2) * (-0.005208333333333333d0) &
+                 + in(i+7,j-2) * (-0.005208333333333333d0) &
+                 + in(i+8,j-2) * (-0.005208333333333333d0) &
+                 + in(i-1,j-1) * (-0.03125d0) &
+                 + in(i+1,j-1) * (-0.03125d0) &
+                 + in(i+2,j-1) * (-0.03125d0) &
+                 + in(i+3,j-1) * (-0.03125d0) &
+                 + in(i+4,j-1) * (-0.03125d0) &
+                 + in(i+5,j-1) * (-0.03125d0) &
+                 + in(i+6,j-1) * (-0.03125d0) &
+                 + in(i+7,j-1) * (-0.03125d0) &
+                 + in(i+8,j-1) * (-0.03125d0) &
+                 + in(i-8,j+1) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+1) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+1) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+1) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+1) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+1) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+1) * (-0.005208333333333333d0) &
+                 + in(i-1,j+1) * (-0.03125d0) &
+                 + in(i+1,j+1) * (0.03125d0) &
+                 + in(i+2,j+1) * (0.005208333333333333d0) &
+                 + in(i+3,j+1) * (0.0020833333333333333d0) &
+                 + in(i+4,j+1) * (0.0011160714285714285d0) &
+                 + in(i+5,j+1) * (0.0006944444444444445d0) &
+                 + in(i+6,j+1) * (0.0004734848484848485d0) &
+                 + in(i+7,j+1) * (0.00034340659340659343d0) &
+                 + in(i+8,j+1) * (0.00026041666666666666d0) &
+                 + in(i-8,j+2) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+2) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+2) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+2) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+2) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+2) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+2) * (-0.005208333333333333d0) &
+                 + in(i-1,j+2) * (-0.03125d0) &
+                 + in(i+1,j+2) * (0.005208333333333333d0) &
+                 + in(i+2,j+2) * (0.015625d0) &
+                 + in(i+3,j+2) * (0.0020833333333333333d0) &
+                 + in(i+4,j+2) * (0.0011160714285714285d0) &
+                 + in(i+5,j+2) * (0.0006944444444444445d0) &
+                 + in(i+6,j+2) * (0.0004734848484848485d0) &
+                 + in(i+7,j+2) * (0.00034340659340659343d0) &
+                 + in(i+8,j+2) * (0.00026041666666666666d0) &
+                 + in(i-8,j+3) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+3) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+3) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+3) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+3) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+3) * (-0.005208333333333333d0) &
+                 + in(i-1,j+3) * (-0.03125d0) &
+                 + in(i+1,j+3) * (0.0020833333333333333d0) &
+                 + in(i+2,j+3) * (0.0020833333333333333d0) &
+                 + in(i+3,j+3) * (0.010416666666666666d0) &
+                 + in(i+4,j+3) * (0.0011160714285714285d0) &
+                 + in(i+5,j+3) * (0.0006944444444444445d0) &
+                 + in(i+6,j+3) * (0.0004734848484848485d0) &
+                 + in(i+7,j+3) * (0.00034340659340659343d0) &
+                 + in(i+8,j+3) * (0.00026041666666666666d0) &
+                 + in(i-8,j+4) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+4) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+4) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+4) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+4) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+4) * (-0.005208333333333333d0) &
+                 + in(i-1,j+4) * (-0.03125d0) &
+                 + in(i+1,j+4) * (0.0011160714285714285d0) &
+                 + in(i+2,j+4) * (0.0011160714285714285d0) &
+                 + in(i+3,j+4) * (0.0011160714285714285d0) &
+                 + in(i+4,j+4) * (0.0078125d0) &
+                 + in(i+5,j+4) * (0.0006944444444444445d0) &
+                 + in(i+6,j+4) * (0.0004734848484848485d0) &
+                 + in(i+7,j+4) * (0.00034340659340659343d0) &
+                 + in(i+8,j+4) * (0.00026041666666666666d0) &
+                 + in(i-8,j+5) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+5) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+5) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+5) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+5) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+5) * (-0.005208333333333333d0) &
+                 + in(i-1,j+5) * (-0.03125d0) &
+                 + in(i+1,j+5) * (0.0006944444444444445d0) &
+                 + in(i+2,j+5) * (0.0006944444444444445d0) &
+                 + in(i+3,j+5) * (0.0006944444444444445d0) &
+                 + in(i+4,j+5) * (0.0006944444444444445d0) &
+                 + in(i+5,j+5) * (0.00625d0) &
+                 + in(i+6,j+5) * (0.0004734848484848485d0) &
+                 + in(i+7,j+5) * (0.00034340659340659343d0) &
+                 + in(i+8,j+5) * (0.00026041666666666666d0) &
+                 + in(i-8,j+6) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+6) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+6) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+6) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+6) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+6) * (-0.005208333333333333d0) &
+                 + in(i-1,j+6) * (-0.03125d0) &
+                 + in(i+1,j+6) * (0.0004734848484848485d0) &
+                 + in(i+2,j+6) * (0.0004734848484848485d0) &
+                 + in(i+3,j+6) * (0.0004734848484848485d0) &
+                 + in(i+4,j+6) * (0.0004734848484848485d0) &
+                 + in(i+5,j+6) * (0.0004734848484848485d0) &
+                 + in(i+6,j+6) * (0.005208333333333333d0) &
+                 + in(i+7,j+6) * (0.00034340659340659343d0) &
+                 + in(i+8,j+6) * (0.00026041666666666666d0) &
+                 + in(i-8,j+7) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+7) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+7) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+7) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+7) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+7) * (-0.005208333333333333d0) &
+                 + in(i-1,j+7) * (-0.03125d0) &
+                 + in(i+1,j+7) * (0.00034340659340659343d0) &
+                 + in(i+2,j+7) * (0.00034340659340659343d0) &
+                 + in(i+3,j+7) * (0.00034340659340659343d0) &
+                 + in(i+4,j+7) * (0.00034340659340659343d0) &
+                 + in(i+5,j+7) * (0.00034340659340659343d0) &
+                 + in(i+6,j+7) * (0.00034340659340659343d0) &
+                 + in(i+7,j+7) * (0.004464285714285714d0) &
+                 + in(i+8,j+7) * (0.00026041666666666666d0) &
+                 + in(i-8,j+8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+8) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+8) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+8) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+8) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+8) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+8) * (-0.005208333333333333d0) &
+                 + in(i-1,j+8) * (-0.03125d0) &
+                 + in(i+1,j+8) * (0.00026041666666666666d0) &
+                 + in(i+2,j+8) * (0.00026041666666666666d0) &
+                 + in(i+3,j+8) * (0.00026041666666666666d0) &
+                 + in(i+4,j+8) * (0.00026041666666666666d0) &
+                 + in(i+5,j+8) * (0.00026041666666666666d0) &
+                 + in(i+6,j+8) * (0.00026041666666666666d0) &
+                 + in(i+7,j+8) * (0.00026041666666666666d0) &
+                 + in(i+8,j+8) * (0.00390625d0) &
 +0.0
       end do
     end do
@@ -1091,258 +1091,258 @@ subroutine grid9(n, in, out)
     do i=9,n-9-1
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i-9,j-9) * (-0.0030864197530864196) &
-                 + in(i+1,j-9) * (-0.00018155410312273057) &
-                 + in(i+2,j-9) * (-0.00018155410312273057) &
-                 + in(i+3,j-9) * (-0.00018155410312273057) &
-                 + in(i+4,j-9) * (-0.00018155410312273057) &
-                 + in(i+5,j-9) * (-0.00018155410312273057) &
-                 + in(i+6,j-9) * (-0.00018155410312273057) &
-                 + in(i+7,j-9) * (-0.00018155410312273057) &
-                 + in(i+8,j-9) * (-0.00018155410312273057) &
-                 + in(i+9,j-9) * (-0.00018155410312273057) &
-                 + in(i-8,j-8) * (-0.003472222222222222) &
-                 + in(i+1,j-8) * (-0.0002314814814814815) &
-                 + in(i+2,j-8) * (-0.0002314814814814815) &
-                 + in(i+3,j-8) * (-0.0002314814814814815) &
-                 + in(i+4,j-8) * (-0.0002314814814814815) &
-                 + in(i+5,j-8) * (-0.0002314814814814815) &
-                 + in(i+6,j-8) * (-0.0002314814814814815) &
-                 + in(i+7,j-8) * (-0.0002314814814814815) &
-                 + in(i+8,j-8) * (-0.0002314814814814815) &
-                 + in(i+9,j-8) * (-0.0002314814814814815) &
-                 + in(i-7,j-7) * (-0.003968253968253968) &
-                 + in(i+1,j-7) * (-0.00030525030525030525) &
-                 + in(i+2,j-7) * (-0.00030525030525030525) &
-                 + in(i+3,j-7) * (-0.00030525030525030525) &
-                 + in(i+4,j-7) * (-0.00030525030525030525) &
-                 + in(i+5,j-7) * (-0.00030525030525030525) &
-                 + in(i+6,j-7) * (-0.00030525030525030525) &
-                 + in(i+7,j-7) * (-0.00030525030525030525) &
-                 + in(i+8,j-7) * (-0.00030525030525030525) &
-                 + in(i+9,j-7) * (-0.00030525030525030525) &
-                 + in(i-6,j-6) * (-0.004629629629629629) &
-                 + in(i+1,j-6) * (-0.00042087542087542086) &
-                 + in(i+2,j-6) * (-0.00042087542087542086) &
-                 + in(i+3,j-6) * (-0.00042087542087542086) &
-                 + in(i+4,j-6) * (-0.00042087542087542086) &
-                 + in(i+5,j-6) * (-0.00042087542087542086) &
-                 + in(i+6,j-6) * (-0.00042087542087542086) &
-                 + in(i+7,j-6) * (-0.00042087542087542086) &
-                 + in(i+8,j-6) * (-0.00042087542087542086) &
-                 + in(i+9,j-6) * (-0.00042087542087542086) &
-                 + in(i-5,j-5) * (-0.005555555555555556) &
-                 + in(i+1,j-5) * (-0.0006172839506172839) &
-                 + in(i+2,j-5) * (-0.0006172839506172839) &
-                 + in(i+3,j-5) * (-0.0006172839506172839) &
-                 + in(i+4,j-5) * (-0.0006172839506172839) &
-                 + in(i+5,j-5) * (-0.0006172839506172839) &
-                 + in(i+6,j-5) * (-0.0006172839506172839) &
-                 + in(i+7,j-5) * (-0.0006172839506172839) &
-                 + in(i+8,j-5) * (-0.0006172839506172839) &
-                 + in(i+9,j-5) * (-0.0006172839506172839) &
-                 + in(i-4,j-4) * (-0.006944444444444444) &
-                 + in(i+1,j-4) * (-0.000992063492063492) &
-                 + in(i+2,j-4) * (-0.000992063492063492) &
-                 + in(i+3,j-4) * (-0.000992063492063492) &
-                 + in(i+4,j-4) * (-0.000992063492063492) &
-                 + in(i+5,j-4) * (-0.000992063492063492) &
-                 + in(i+6,j-4) * (-0.000992063492063492) &
-                 + in(i+7,j-4) * (-0.000992063492063492) &
-                 + in(i+8,j-4) * (-0.000992063492063492) &
-                 + in(i+9,j-4) * (-0.000992063492063492) &
-                 + in(i-3,j-3) * (-0.009259259259259259) &
-                 + in(i+1,j-3) * (-0.001851851851851852) &
-                 + in(i+2,j-3) * (-0.001851851851851852) &
-                 + in(i+3,j-3) * (-0.001851851851851852) &
-                 + in(i+4,j-3) * (-0.001851851851851852) &
-                 + in(i+5,j-3) * (-0.001851851851851852) &
-                 + in(i+6,j-3) * (-0.001851851851851852) &
-                 + in(i+7,j-3) * (-0.001851851851851852) &
-                 + in(i+8,j-3) * (-0.001851851851851852) &
-                 + in(i+9,j-3) * (-0.001851851851851852) &
-                 + in(i-2,j-2) * (-0.013888888888888888) &
-                 + in(i+1,j-2) * (-0.004629629629629629) &
-                 + in(i+2,j-2) * (-0.004629629629629629) &
-                 + in(i+3,j-2) * (-0.004629629629629629) &
-                 + in(i+4,j-2) * (-0.004629629629629629) &
-                 + in(i+5,j-2) * (-0.004629629629629629) &
-                 + in(i+6,j-2) * (-0.004629629629629629) &
-                 + in(i+7,j-2) * (-0.004629629629629629) &
-                 + in(i+8,j-2) * (-0.004629629629629629) &
-                 + in(i+9,j-2) * (-0.004629629629629629) &
-                 + in(i-1,j-1) * (-0.027777777777777776) &
-                 + in(i+1,j-1) * (-0.027777777777777776) &
-                 + in(i+2,j-1) * (-0.027777777777777776) &
-                 + in(i+3,j-1) * (-0.027777777777777776) &
-                 + in(i+4,j-1) * (-0.027777777777777776) &
-                 + in(i+5,j-1) * (-0.027777777777777776) &
-                 + in(i+6,j-1) * (-0.027777777777777776) &
-                 + in(i+7,j-1) * (-0.027777777777777776) &
-                 + in(i+8,j-1) * (-0.027777777777777776) &
-                 + in(i+9,j-1) * (-0.027777777777777776) &
-                 + in(i-9,j+1) * (-0.00018155410312273057) &
-                 + in(i-8,j+1) * (-0.0002314814814814815) &
-                 + in(i-7,j+1) * (-0.00030525030525030525) &
-                 + in(i-6,j+1) * (-0.00042087542087542086) &
-                 + in(i-5,j+1) * (-0.0006172839506172839) &
-                 + in(i-4,j+1) * (-0.000992063492063492) &
-                 + in(i-3,j+1) * (-0.001851851851851852) &
-                 + in(i-2,j+1) * (-0.004629629629629629) &
-                 + in(i-1,j+1) * (-0.027777777777777776) &
-                 + in(i+1,j+1) * (0.027777777777777776) &
-                 + in(i+2,j+1) * (0.004629629629629629) &
-                 + in(i+3,j+1) * (0.001851851851851852) &
-                 + in(i+4,j+1) * (0.000992063492063492) &
-                 + in(i+5,j+1) * (0.0006172839506172839) &
-                 + in(i+6,j+1) * (0.00042087542087542086) &
-                 + in(i+7,j+1) * (0.00030525030525030525) &
-                 + in(i+8,j+1) * (0.0002314814814814815) &
-                 + in(i+9,j+1) * (0.00018155410312273057) &
-                 + in(i-9,j+2) * (-0.00018155410312273057) &
-                 + in(i-8,j+2) * (-0.0002314814814814815) &
-                 + in(i-7,j+2) * (-0.00030525030525030525) &
-                 + in(i-6,j+2) * (-0.00042087542087542086) &
-                 + in(i-5,j+2) * (-0.0006172839506172839) &
-                 + in(i-4,j+2) * (-0.000992063492063492) &
-                 + in(i-3,j+2) * (-0.001851851851851852) &
-                 + in(i-2,j+2) * (-0.004629629629629629) &
-                 + in(i-1,j+2) * (-0.027777777777777776) &
-                 + in(i+1,j+2) * (0.004629629629629629) &
-                 + in(i+2,j+2) * (0.013888888888888888) &
-                 + in(i+3,j+2) * (0.001851851851851852) &
-                 + in(i+4,j+2) * (0.000992063492063492) &
-                 + in(i+5,j+2) * (0.0006172839506172839) &
-                 + in(i+6,j+2) * (0.00042087542087542086) &
-                 + in(i+7,j+2) * (0.00030525030525030525) &
-                 + in(i+8,j+2) * (0.0002314814814814815) &
-                 + in(i+9,j+2) * (0.00018155410312273057) &
-                 + in(i-9,j+3) * (-0.00018155410312273057) &
-                 + in(i-8,j+3) * (-0.0002314814814814815) &
-                 + in(i-7,j+3) * (-0.00030525030525030525) &
-                 + in(i-6,j+3) * (-0.00042087542087542086) &
-                 + in(i-5,j+3) * (-0.0006172839506172839) &
-                 + in(i-4,j+3) * (-0.000992063492063492) &
-                 + in(i-3,j+3) * (-0.001851851851851852) &
-                 + in(i-2,j+3) * (-0.004629629629629629) &
-                 + in(i-1,j+3) * (-0.027777777777777776) &
-                 + in(i+1,j+3) * (0.001851851851851852) &
-                 + in(i+2,j+3) * (0.001851851851851852) &
-                 + in(i+3,j+3) * (0.009259259259259259) &
-                 + in(i+4,j+3) * (0.000992063492063492) &
-                 + in(i+5,j+3) * (0.0006172839506172839) &
-                 + in(i+6,j+3) * (0.00042087542087542086) &
-                 + in(i+7,j+3) * (0.00030525030525030525) &
-                 + in(i+8,j+3) * (0.0002314814814814815) &
-                 + in(i+9,j+3) * (0.00018155410312273057) &
-                 + in(i-9,j+4) * (-0.00018155410312273057) &
-                 + in(i-8,j+4) * (-0.0002314814814814815) &
-                 + in(i-7,j+4) * (-0.00030525030525030525) &
-                 + in(i-6,j+4) * (-0.00042087542087542086) &
-                 + in(i-5,j+4) * (-0.0006172839506172839) &
-                 + in(i-4,j+4) * (-0.000992063492063492) &
-                 + in(i-3,j+4) * (-0.001851851851851852) &
-                 + in(i-2,j+4) * (-0.004629629629629629) &
-                 + in(i-1,j+4) * (-0.027777777777777776) &
-                 + in(i+1,j+4) * (0.000992063492063492) &
-                 + in(i+2,j+4) * (0.000992063492063492) &
-                 + in(i+3,j+4) * (0.000992063492063492) &
-                 + in(i+4,j+4) * (0.006944444444444444) &
-                 + in(i+5,j+4) * (0.0006172839506172839) &
-                 + in(i+6,j+4) * (0.00042087542087542086) &
-                 + in(i+7,j+4) * (0.00030525030525030525) &
-                 + in(i+8,j+4) * (0.0002314814814814815) &
-                 + in(i+9,j+4) * (0.00018155410312273057) &
-                 + in(i-9,j+5) * (-0.00018155410312273057) &
-                 + in(i-8,j+5) * (-0.0002314814814814815) &
-                 + in(i-7,j+5) * (-0.00030525030525030525) &
-                 + in(i-6,j+5) * (-0.00042087542087542086) &
-                 + in(i-5,j+5) * (-0.0006172839506172839) &
-                 + in(i-4,j+5) * (-0.000992063492063492) &
-                 + in(i-3,j+5) * (-0.001851851851851852) &
-                 + in(i-2,j+5) * (-0.004629629629629629) &
-                 + in(i-1,j+5) * (-0.027777777777777776) &
-                 + in(i+1,j+5) * (0.0006172839506172839) &
-                 + in(i+2,j+5) * (0.0006172839506172839) &
-                 + in(i+3,j+5) * (0.0006172839506172839) &
-                 + in(i+4,j+5) * (0.0006172839506172839) &
-                 + in(i+5,j+5) * (0.005555555555555556) &
-                 + in(i+6,j+5) * (0.00042087542087542086) &
-                 + in(i+7,j+5) * (0.00030525030525030525) &
-                 + in(i+8,j+5) * (0.0002314814814814815) &
-                 + in(i+9,j+5) * (0.00018155410312273057) &
-                 + in(i-9,j+6) * (-0.00018155410312273057) &
-                 + in(i-8,j+6) * (-0.0002314814814814815) &
-                 + in(i-7,j+6) * (-0.00030525030525030525) &
-                 + in(i-6,j+6) * (-0.00042087542087542086) &
-                 + in(i-5,j+6) * (-0.0006172839506172839) &
-                 + in(i-4,j+6) * (-0.000992063492063492) &
-                 + in(i-3,j+6) * (-0.001851851851851852) &
-                 + in(i-2,j+6) * (-0.004629629629629629) &
-                 + in(i-1,j+6) * (-0.027777777777777776) &
-                 + in(i+1,j+6) * (0.00042087542087542086) &
-                 + in(i+2,j+6) * (0.00042087542087542086) &
-                 + in(i+3,j+6) * (0.00042087542087542086) &
-                 + in(i+4,j+6) * (0.00042087542087542086) &
-                 + in(i+5,j+6) * (0.00042087542087542086) &
-                 + in(i+6,j+6) * (0.004629629629629629) &
-                 + in(i+7,j+6) * (0.00030525030525030525) &
-                 + in(i+8,j+6) * (0.0002314814814814815) &
-                 + in(i+9,j+6) * (0.00018155410312273057) &
-                 + in(i-9,j+7) * (-0.00018155410312273057) &
-                 + in(i-8,j+7) * (-0.0002314814814814815) &
-                 + in(i-7,j+7) * (-0.00030525030525030525) &
-                 + in(i-6,j+7) * (-0.00042087542087542086) &
-                 + in(i-5,j+7) * (-0.0006172839506172839) &
-                 + in(i-4,j+7) * (-0.000992063492063492) &
-                 + in(i-3,j+7) * (-0.001851851851851852) &
-                 + in(i-2,j+7) * (-0.004629629629629629) &
-                 + in(i-1,j+7) * (-0.027777777777777776) &
-                 + in(i+1,j+7) * (0.00030525030525030525) &
-                 + in(i+2,j+7) * (0.00030525030525030525) &
-                 + in(i+3,j+7) * (0.00030525030525030525) &
-                 + in(i+4,j+7) * (0.00030525030525030525) &
-                 + in(i+5,j+7) * (0.00030525030525030525) &
-                 + in(i+6,j+7) * (0.00030525030525030525) &
-                 + in(i+7,j+7) * (0.003968253968253968) &
-                 + in(i+8,j+7) * (0.0002314814814814815) &
-                 + in(i+9,j+7) * (0.00018155410312273057) &
-                 + in(i-9,j+8) * (-0.00018155410312273057) &
-                 + in(i-8,j+8) * (-0.0002314814814814815) &
-                 + in(i-7,j+8) * (-0.00030525030525030525) &
-                 + in(i-6,j+8) * (-0.00042087542087542086) &
-                 + in(i-5,j+8) * (-0.0006172839506172839) &
-                 + in(i-4,j+8) * (-0.000992063492063492) &
-                 + in(i-3,j+8) * (-0.001851851851851852) &
-                 + in(i-2,j+8) * (-0.004629629629629629) &
-                 + in(i-1,j+8) * (-0.027777777777777776) &
-                 + in(i+1,j+8) * (0.0002314814814814815) &
-                 + in(i+2,j+8) * (0.0002314814814814815) &
-                 + in(i+3,j+8) * (0.0002314814814814815) &
-                 + in(i+4,j+8) * (0.0002314814814814815) &
-                 + in(i+5,j+8) * (0.0002314814814814815) &
-                 + in(i+6,j+8) * (0.0002314814814814815) &
-                 + in(i+7,j+8) * (0.0002314814814814815) &
-                 + in(i+8,j+8) * (0.003472222222222222) &
-                 + in(i+9,j+8) * (0.00018155410312273057) &
-                 + in(i-9,j+9) * (-0.00018155410312273057) &
-                 + in(i-8,j+9) * (-0.0002314814814814815) &
-                 + in(i-7,j+9) * (-0.00030525030525030525) &
-                 + in(i-6,j+9) * (-0.00042087542087542086) &
-                 + in(i-5,j+9) * (-0.0006172839506172839) &
-                 + in(i-4,j+9) * (-0.000992063492063492) &
-                 + in(i-3,j+9) * (-0.001851851851851852) &
-                 + in(i-2,j+9) * (-0.004629629629629629) &
-                 + in(i-1,j+9) * (-0.027777777777777776) &
-                 + in(i+1,j+9) * (0.00018155410312273057) &
-                 + in(i+2,j+9) * (0.00018155410312273057) &
-                 + in(i+3,j+9) * (0.00018155410312273057) &
-                 + in(i+4,j+9) * (0.00018155410312273057) &
-                 + in(i+5,j+9) * (0.00018155410312273057) &
-                 + in(i+6,j+9) * (0.00018155410312273057) &
-                 + in(i+7,j+9) * (0.00018155410312273057) &
-                 + in(i+8,j+9) * (0.00018155410312273057) &
-                 + in(i+9,j+9) * (0.0030864197530864196) &
+                 + in(i-9,j-9) * (-0.0030864197530864196d0) &
+                 + in(i+1,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+2,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+3,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+4,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+5,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+6,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+7,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+8,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+9,j-9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j-8) * (-0.003472222222222222d0) &
+                 + in(i+1,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+2,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+3,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+4,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+5,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+6,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+7,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+8,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+9,j-8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j-7) * (-0.003968253968253968d0) &
+                 + in(i+1,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+2,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+3,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+4,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+5,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+6,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+7,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+8,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+9,j-7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j-6) * (-0.004629629629629629d0) &
+                 + in(i+1,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+2,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+3,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+4,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+5,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+6,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+7,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+8,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+9,j-6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j-5) * (-0.005555555555555556d0) &
+                 + in(i+1,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+2,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+3,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+4,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+5,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+6,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+7,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+8,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+9,j-5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j-4) * (-0.006944444444444444d0) &
+                 + in(i+1,j-4) * (-0.000992063492063492d0) &
+                 + in(i+2,j-4) * (-0.000992063492063492d0) &
+                 + in(i+3,j-4) * (-0.000992063492063492d0) &
+                 + in(i+4,j-4) * (-0.000992063492063492d0) &
+                 + in(i+5,j-4) * (-0.000992063492063492d0) &
+                 + in(i+6,j-4) * (-0.000992063492063492d0) &
+                 + in(i+7,j-4) * (-0.000992063492063492d0) &
+                 + in(i+8,j-4) * (-0.000992063492063492d0) &
+                 + in(i+9,j-4) * (-0.000992063492063492d0) &
+                 + in(i-3,j-3) * (-0.009259259259259259d0) &
+                 + in(i+1,j-3) * (-0.001851851851851852d0) &
+                 + in(i+2,j-3) * (-0.001851851851851852d0) &
+                 + in(i+3,j-3) * (-0.001851851851851852d0) &
+                 + in(i+4,j-3) * (-0.001851851851851852d0) &
+                 + in(i+5,j-3) * (-0.001851851851851852d0) &
+                 + in(i+6,j-3) * (-0.001851851851851852d0) &
+                 + in(i+7,j-3) * (-0.001851851851851852d0) &
+                 + in(i+8,j-3) * (-0.001851851851851852d0) &
+                 + in(i+9,j-3) * (-0.001851851851851852d0) &
+                 + in(i-2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+1,j-2) * (-0.004629629629629629d0) &
+                 + in(i+2,j-2) * (-0.004629629629629629d0) &
+                 + in(i+3,j-2) * (-0.004629629629629629d0) &
+                 + in(i+4,j-2) * (-0.004629629629629629d0) &
+                 + in(i+5,j-2) * (-0.004629629629629629d0) &
+                 + in(i+6,j-2) * (-0.004629629629629629d0) &
+                 + in(i+7,j-2) * (-0.004629629629629629d0) &
+                 + in(i+8,j-2) * (-0.004629629629629629d0) &
+                 + in(i+9,j-2) * (-0.004629629629629629d0) &
+                 + in(i-1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+2,j-1) * (-0.027777777777777776d0) &
+                 + in(i+3,j-1) * (-0.027777777777777776d0) &
+                 + in(i+4,j-1) * (-0.027777777777777776d0) &
+                 + in(i+5,j-1) * (-0.027777777777777776d0) &
+                 + in(i+6,j-1) * (-0.027777777777777776d0) &
+                 + in(i+7,j-1) * (-0.027777777777777776d0) &
+                 + in(i+8,j-1) * (-0.027777777777777776d0) &
+                 + in(i+9,j-1) * (-0.027777777777777776d0) &
+                 + in(i-9,j+1) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+1) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+1) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+1) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+1) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+1) * (-0.000992063492063492d0) &
+                 + in(i-3,j+1) * (-0.001851851851851852d0) &
+                 + in(i-2,j+1) * (-0.004629629629629629d0) &
+                 + in(i-1,j+1) * (-0.027777777777777776d0) &
+                 + in(i+1,j+1) * (0.027777777777777776d0) &
+                 + in(i+2,j+1) * (0.004629629629629629d0) &
+                 + in(i+3,j+1) * (0.001851851851851852d0) &
+                 + in(i+4,j+1) * (0.000992063492063492d0) &
+                 + in(i+5,j+1) * (0.0006172839506172839d0) &
+                 + in(i+6,j+1) * (0.00042087542087542086d0) &
+                 + in(i+7,j+1) * (0.00030525030525030525d0) &
+                 + in(i+8,j+1) * (0.0002314814814814815d0) &
+                 + in(i+9,j+1) * (0.00018155410312273057d0) &
+                 + in(i-9,j+2) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+2) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+2) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+2) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+2) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+2) * (-0.000992063492063492d0) &
+                 + in(i-3,j+2) * (-0.001851851851851852d0) &
+                 + in(i-2,j+2) * (-0.004629629629629629d0) &
+                 + in(i-1,j+2) * (-0.027777777777777776d0) &
+                 + in(i+1,j+2) * (0.004629629629629629d0) &
+                 + in(i+2,j+2) * (0.013888888888888888d0) &
+                 + in(i+3,j+2) * (0.001851851851851852d0) &
+                 + in(i+4,j+2) * (0.000992063492063492d0) &
+                 + in(i+5,j+2) * (0.0006172839506172839d0) &
+                 + in(i+6,j+2) * (0.00042087542087542086d0) &
+                 + in(i+7,j+2) * (0.00030525030525030525d0) &
+                 + in(i+8,j+2) * (0.0002314814814814815d0) &
+                 + in(i+9,j+2) * (0.00018155410312273057d0) &
+                 + in(i-9,j+3) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+3) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+3) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+3) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+3) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+3) * (-0.000992063492063492d0) &
+                 + in(i-3,j+3) * (-0.001851851851851852d0) &
+                 + in(i-2,j+3) * (-0.004629629629629629d0) &
+                 + in(i-1,j+3) * (-0.027777777777777776d0) &
+                 + in(i+1,j+3) * (0.001851851851851852d0) &
+                 + in(i+2,j+3) * (0.001851851851851852d0) &
+                 + in(i+3,j+3) * (0.009259259259259259d0) &
+                 + in(i+4,j+3) * (0.000992063492063492d0) &
+                 + in(i+5,j+3) * (0.0006172839506172839d0) &
+                 + in(i+6,j+3) * (0.00042087542087542086d0) &
+                 + in(i+7,j+3) * (0.00030525030525030525d0) &
+                 + in(i+8,j+3) * (0.0002314814814814815d0) &
+                 + in(i+9,j+3) * (0.00018155410312273057d0) &
+                 + in(i-9,j+4) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+4) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+4) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+4) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+4) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+4) * (-0.000992063492063492d0) &
+                 + in(i-3,j+4) * (-0.001851851851851852d0) &
+                 + in(i-2,j+4) * (-0.004629629629629629d0) &
+                 + in(i-1,j+4) * (-0.027777777777777776d0) &
+                 + in(i+1,j+4) * (0.000992063492063492d0) &
+                 + in(i+2,j+4) * (0.000992063492063492d0) &
+                 + in(i+3,j+4) * (0.000992063492063492d0) &
+                 + in(i+4,j+4) * (0.006944444444444444d0) &
+                 + in(i+5,j+4) * (0.0006172839506172839d0) &
+                 + in(i+6,j+4) * (0.00042087542087542086d0) &
+                 + in(i+7,j+4) * (0.00030525030525030525d0) &
+                 + in(i+8,j+4) * (0.0002314814814814815d0) &
+                 + in(i+9,j+4) * (0.00018155410312273057d0) &
+                 + in(i-9,j+5) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+5) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+5) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+5) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+5) * (-0.000992063492063492d0) &
+                 + in(i-3,j+5) * (-0.001851851851851852d0) &
+                 + in(i-2,j+5) * (-0.004629629629629629d0) &
+                 + in(i-1,j+5) * (-0.027777777777777776d0) &
+                 + in(i+1,j+5) * (0.0006172839506172839d0) &
+                 + in(i+2,j+5) * (0.0006172839506172839d0) &
+                 + in(i+3,j+5) * (0.0006172839506172839d0) &
+                 + in(i+4,j+5) * (0.0006172839506172839d0) &
+                 + in(i+5,j+5) * (0.005555555555555556d0) &
+                 + in(i+6,j+5) * (0.00042087542087542086d0) &
+                 + in(i+7,j+5) * (0.00030525030525030525d0) &
+                 + in(i+8,j+5) * (0.0002314814814814815d0) &
+                 + in(i+9,j+5) * (0.00018155410312273057d0) &
+                 + in(i-9,j+6) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+6) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+6) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+6) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+6) * (-0.000992063492063492d0) &
+                 + in(i-3,j+6) * (-0.001851851851851852d0) &
+                 + in(i-2,j+6) * (-0.004629629629629629d0) &
+                 + in(i-1,j+6) * (-0.027777777777777776d0) &
+                 + in(i+1,j+6) * (0.00042087542087542086d0) &
+                 + in(i+2,j+6) * (0.00042087542087542086d0) &
+                 + in(i+3,j+6) * (0.00042087542087542086d0) &
+                 + in(i+4,j+6) * (0.00042087542087542086d0) &
+                 + in(i+5,j+6) * (0.00042087542087542086d0) &
+                 + in(i+6,j+6) * (0.004629629629629629d0) &
+                 + in(i+7,j+6) * (0.00030525030525030525d0) &
+                 + in(i+8,j+6) * (0.0002314814814814815d0) &
+                 + in(i+9,j+6) * (0.00018155410312273057d0) &
+                 + in(i-9,j+7) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+7) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+7) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+7) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+7) * (-0.000992063492063492d0) &
+                 + in(i-3,j+7) * (-0.001851851851851852d0) &
+                 + in(i-2,j+7) * (-0.004629629629629629d0) &
+                 + in(i-1,j+7) * (-0.027777777777777776d0) &
+                 + in(i+1,j+7) * (0.00030525030525030525d0) &
+                 + in(i+2,j+7) * (0.00030525030525030525d0) &
+                 + in(i+3,j+7) * (0.00030525030525030525d0) &
+                 + in(i+4,j+7) * (0.00030525030525030525d0) &
+                 + in(i+5,j+7) * (0.00030525030525030525d0) &
+                 + in(i+6,j+7) * (0.00030525030525030525d0) &
+                 + in(i+7,j+7) * (0.003968253968253968d0) &
+                 + in(i+8,j+7) * (0.0002314814814814815d0) &
+                 + in(i+9,j+7) * (0.00018155410312273057d0) &
+                 + in(i-9,j+8) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+8) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+8) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+8) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+8) * (-0.000992063492063492d0) &
+                 + in(i-3,j+8) * (-0.001851851851851852d0) &
+                 + in(i-2,j+8) * (-0.004629629629629629d0) &
+                 + in(i-1,j+8) * (-0.027777777777777776d0) &
+                 + in(i+1,j+8) * (0.0002314814814814815d0) &
+                 + in(i+2,j+8) * (0.0002314814814814815d0) &
+                 + in(i+3,j+8) * (0.0002314814814814815d0) &
+                 + in(i+4,j+8) * (0.0002314814814814815d0) &
+                 + in(i+5,j+8) * (0.0002314814814814815d0) &
+                 + in(i+6,j+8) * (0.0002314814814814815d0) &
+                 + in(i+7,j+8) * (0.0002314814814814815d0) &
+                 + in(i+8,j+8) * (0.003472222222222222d0) &
+                 + in(i+9,j+8) * (0.00018155410312273057d0) &
+                 + in(i-9,j+9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+9) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+9) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+9) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+9) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+9) * (-0.000992063492063492d0) &
+                 + in(i-3,j+9) * (-0.001851851851851852d0) &
+                 + in(i-2,j+9) * (-0.004629629629629629d0) &
+                 + in(i-1,j+9) * (-0.027777777777777776d0) &
+                 + in(i+1,j+9) * (0.00018155410312273057d0) &
+                 + in(i+2,j+9) * (0.00018155410312273057d0) &
+                 + in(i+3,j+9) * (0.00018155410312273057d0) &
+                 + in(i+4,j+9) * (0.00018155410312273057d0) &
+                 + in(i+5,j+9) * (0.00018155410312273057d0) &
+                 + in(i+6,j+9) * (0.00018155410312273057d0) &
+                 + in(i+7,j+9) * (0.00018155410312273057d0) &
+                 + in(i+8,j+9) * (0.00018155410312273057d0) &
+                 + in(i+9,j+9) * (0.0030864197530864196d0) &
 +0.0
       end do
     end do
diff --git a/FORTRAN/stencil_serial.f90 b/FORTRAN/stencil_serial.f90
index 5e2b50d4e..cb4bf8052 100644
--- a/FORTRAN/stencil_serial.f90
+++ b/FORTRAN/stencil_serial.f90
@@ -8,10 +8,10 @@ subroutine star1(n, in, out)
     do i=1,n-1-1
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-1) * (-0.5) &
-                 + in(i-1,j+0) * (-0.5) &
-                 + in(i+1,j+0) * (0.5) &
-                 + in(i+0,j+1) * (0.5) &
+                 + in(i+0,j-1) * (-0.5d0) &
+                 + in(i-1,j+0) * (-0.5d0) &
+                 + in(i+1,j+0) * (0.5d0) &
+                 + in(i+0,j+1) * (0.5d0) &
 +0.0
       end do
     end do
@@ -27,14 +27,14 @@ subroutine star2(n, in, out)
     do i=2,n-2-1
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-2) * (-0.125) &
-                 + in(i+0,j-1) * (-0.25) &
-                 + in(i-2,j+0) * (-0.125) &
-                 + in(i-1,j+0) * (-0.25) &
-                 + in(i+1,j+0) * (0.25) &
-                 + in(i+2,j+0) * (0.125) &
-                 + in(i+0,j+1) * (0.25) &
-                 + in(i+0,j+2) * (0.125) &
+                 + in(i+0,j-2) * (-0.125d0) &
+                 + in(i+0,j-1) * (-0.25d0) &
+                 + in(i-2,j+0) * (-0.125d0) &
+                 + in(i-1,j+0) * (-0.25d0) &
+                 + in(i+1,j+0) * (0.25d0) &
+                 + in(i+2,j+0) * (0.125d0) &
+                 + in(i+0,j+1) * (0.25d0) &
+                 + in(i+0,j+2) * (0.125d0) &
 +0.0
       end do
     end do
@@ -50,18 +50,18 @@ subroutine star3(n, in, out)
     do i=3,n-3-1
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-3) * (-0.05555555555555555) &
-                 + in(i+0,j-2) * (-0.08333333333333333) &
-                 + in(i+0,j-1) * (-0.16666666666666666) &
-                 + in(i-3,j+0) * (-0.05555555555555555) &
-                 + in(i-2,j+0) * (-0.08333333333333333) &
-                 + in(i-1,j+0) * (-0.16666666666666666) &
-                 + in(i+1,j+0) * (0.16666666666666666) &
-                 + in(i+2,j+0) * (0.08333333333333333) &
-                 + in(i+3,j+0) * (0.05555555555555555) &
-                 + in(i+0,j+1) * (0.16666666666666666) &
-                 + in(i+0,j+2) * (0.08333333333333333) &
-                 + in(i+0,j+3) * (0.05555555555555555) &
+                 + in(i+0,j-3) * (-0.05555555555555555d0) &
+                 + in(i+0,j-2) * (-0.08333333333333333d0) &
+                 + in(i+0,j-1) * (-0.16666666666666666d0) &
+                 + in(i-3,j+0) * (-0.05555555555555555d0) &
+                 + in(i-2,j+0) * (-0.08333333333333333d0) &
+                 + in(i-1,j+0) * (-0.16666666666666666d0) &
+                 + in(i+1,j+0) * (0.16666666666666666d0) &
+                 + in(i+2,j+0) * (0.08333333333333333d0) &
+                 + in(i+3,j+0) * (0.05555555555555555d0) &
+                 + in(i+0,j+1) * (0.16666666666666666d0) &
+                 + in(i+0,j+2) * (0.08333333333333333d0) &
+                 + in(i+0,j+3) * (0.05555555555555555d0) &
 +0.0
       end do
     end do
@@ -77,22 +77,22 @@ subroutine star4(n, in, out)
     do i=4,n-4-1
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-4) * (-0.03125) &
-                 + in(i+0,j-3) * (-0.041666666666666664) &
-                 + in(i+0,j-2) * (-0.0625) &
-                 + in(i+0,j-1) * (-0.125) &
-                 + in(i-4,j+0) * (-0.03125) &
-                 + in(i-3,j+0) * (-0.041666666666666664) &
-                 + in(i-2,j+0) * (-0.0625) &
-                 + in(i-1,j+0) * (-0.125) &
-                 + in(i+1,j+0) * (0.125) &
-                 + in(i+2,j+0) * (0.0625) &
-                 + in(i+3,j+0) * (0.041666666666666664) &
-                 + in(i+4,j+0) * (0.03125) &
-                 + in(i+0,j+1) * (0.125) &
-                 + in(i+0,j+2) * (0.0625) &
-                 + in(i+0,j+3) * (0.041666666666666664) &
-                 + in(i+0,j+4) * (0.03125) &
+                 + in(i+0,j-4) * (-0.03125d0) &
+                 + in(i+0,j-3) * (-0.041666666666666664d0) &
+                 + in(i+0,j-2) * (-0.0625d0) &
+                 + in(i+0,j-1) * (-0.125d0) &
+                 + in(i-4,j+0) * (-0.03125d0) &
+                 + in(i-3,j+0) * (-0.041666666666666664d0) &
+                 + in(i-2,j+0) * (-0.0625d0) &
+                 + in(i-1,j+0) * (-0.125d0) &
+                 + in(i+1,j+0) * (0.125d0) &
+                 + in(i+2,j+0) * (0.0625d0) &
+                 + in(i+3,j+0) * (0.041666666666666664d0) &
+                 + in(i+4,j+0) * (0.03125d0) &
+                 + in(i+0,j+1) * (0.125d0) &
+                 + in(i+0,j+2) * (0.0625d0) &
+                 + in(i+0,j+3) * (0.041666666666666664d0) &
+                 + in(i+0,j+4) * (0.03125d0) &
 +0.0
       end do
     end do
@@ -108,26 +108,26 @@ subroutine star5(n, in, out)
     do i=5,n-5-1
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-5) * (-0.02) &
-                 + in(i+0,j-4) * (-0.025) &
-                 + in(i+0,j-3) * (-0.03333333333333333) &
-                 + in(i+0,j-2) * (-0.05) &
-                 + in(i+0,j-1) * (-0.1) &
-                 + in(i-5,j+0) * (-0.02) &
-                 + in(i-4,j+0) * (-0.025) &
-                 + in(i-3,j+0) * (-0.03333333333333333) &
-                 + in(i-2,j+0) * (-0.05) &
-                 + in(i-1,j+0) * (-0.1) &
-                 + in(i+1,j+0) * (0.1) &
-                 + in(i+2,j+0) * (0.05) &
-                 + in(i+3,j+0) * (0.03333333333333333) &
-                 + in(i+4,j+0) * (0.025) &
-                 + in(i+5,j+0) * (0.02) &
-                 + in(i+0,j+1) * (0.1) &
-                 + in(i+0,j+2) * (0.05) &
-                 + in(i+0,j+3) * (0.03333333333333333) &
-                 + in(i+0,j+4) * (0.025) &
-                 + in(i+0,j+5) * (0.02) &
+                 + in(i+0,j-5) * (-0.02d0) &
+                 + in(i+0,j-4) * (-0.025d0) &
+                 + in(i+0,j-3) * (-0.03333333333333333d0) &
+                 + in(i+0,j-2) * (-0.05d0) &
+                 + in(i+0,j-1) * (-0.1d0) &
+                 + in(i-5,j+0) * (-0.02d0) &
+                 + in(i-4,j+0) * (-0.025d0) &
+                 + in(i-3,j+0) * (-0.03333333333333333d0) &
+                 + in(i-2,j+0) * (-0.05d0) &
+                 + in(i-1,j+0) * (-0.1d0) &
+                 + in(i+1,j+0) * (0.1d0) &
+                 + in(i+2,j+0) * (0.05d0) &
+                 + in(i+3,j+0) * (0.03333333333333333d0) &
+                 + in(i+4,j+0) * (0.025d0) &
+                 + in(i+5,j+0) * (0.02d0) &
+                 + in(i+0,j+1) * (0.1d0) &
+                 + in(i+0,j+2) * (0.05d0) &
+                 + in(i+0,j+3) * (0.03333333333333333d0) &
+                 + in(i+0,j+4) * (0.025d0) &
+                 + in(i+0,j+5) * (0.02d0) &
 +0.0
       end do
     end do
@@ -143,30 +143,30 @@ subroutine star6(n, in, out)
     do i=6,n-6-1
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-6) * (-0.013888888888888888) &
-                 + in(i+0,j-5) * (-0.016666666666666666) &
-                 + in(i+0,j-4) * (-0.020833333333333332) &
-                 + in(i+0,j-3) * (-0.027777777777777776) &
-                 + in(i+0,j-2) * (-0.041666666666666664) &
-                 + in(i+0,j-1) * (-0.08333333333333333) &
-                 + in(i-6,j+0) * (-0.013888888888888888) &
-                 + in(i-5,j+0) * (-0.016666666666666666) &
-                 + in(i-4,j+0) * (-0.020833333333333332) &
-                 + in(i-3,j+0) * (-0.027777777777777776) &
-                 + in(i-2,j+0) * (-0.041666666666666664) &
-                 + in(i-1,j+0) * (-0.08333333333333333) &
-                 + in(i+1,j+0) * (0.08333333333333333) &
-                 + in(i+2,j+0) * (0.041666666666666664) &
-                 + in(i+3,j+0) * (0.027777777777777776) &
-                 + in(i+4,j+0) * (0.020833333333333332) &
-                 + in(i+5,j+0) * (0.016666666666666666) &
-                 + in(i+6,j+0) * (0.013888888888888888) &
-                 + in(i+0,j+1) * (0.08333333333333333) &
-                 + in(i+0,j+2) * (0.041666666666666664) &
-                 + in(i+0,j+3) * (0.027777777777777776) &
-                 + in(i+0,j+4) * (0.020833333333333332) &
-                 + in(i+0,j+5) * (0.016666666666666666) &
-                 + in(i+0,j+6) * (0.013888888888888888) &
+                 + in(i+0,j-6) * (-0.013888888888888888d0) &
+                 + in(i+0,j-5) * (-0.016666666666666666d0) &
+                 + in(i+0,j-4) * (-0.020833333333333332d0) &
+                 + in(i+0,j-3) * (-0.027777777777777776d0) &
+                 + in(i+0,j-2) * (-0.041666666666666664d0) &
+                 + in(i+0,j-1) * (-0.08333333333333333d0) &
+                 + in(i-6,j+0) * (-0.013888888888888888d0) &
+                 + in(i-5,j+0) * (-0.016666666666666666d0) &
+                 + in(i-4,j+0) * (-0.020833333333333332d0) &
+                 + in(i-3,j+0) * (-0.027777777777777776d0) &
+                 + in(i-2,j+0) * (-0.041666666666666664d0) &
+                 + in(i-1,j+0) * (-0.08333333333333333d0) &
+                 + in(i+1,j+0) * (0.08333333333333333d0) &
+                 + in(i+2,j+0) * (0.041666666666666664d0) &
+                 + in(i+3,j+0) * (0.027777777777777776d0) &
+                 + in(i+4,j+0) * (0.020833333333333332d0) &
+                 + in(i+5,j+0) * (0.016666666666666666d0) &
+                 + in(i+6,j+0) * (0.013888888888888888d0) &
+                 + in(i+0,j+1) * (0.08333333333333333d0) &
+                 + in(i+0,j+2) * (0.041666666666666664d0) &
+                 + in(i+0,j+3) * (0.027777777777777776d0) &
+                 + in(i+0,j+4) * (0.020833333333333332d0) &
+                 + in(i+0,j+5) * (0.016666666666666666d0) &
+                 + in(i+0,j+6) * (0.013888888888888888d0) &
 +0.0
       end do
     end do
@@ -182,34 +182,34 @@ subroutine star7(n, in, out)
     do i=7,n-7-1
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-7) * (-0.01020408163265306) &
-                 + in(i+0,j-6) * (-0.011904761904761904) &
-                 + in(i+0,j-5) * (-0.014285714285714285) &
-                 + in(i+0,j-4) * (-0.017857142857142856) &
-                 + in(i+0,j-3) * (-0.023809523809523808) &
-                 + in(i+0,j-2) * (-0.03571428571428571) &
-                 + in(i+0,j-1) * (-0.07142857142857142) &
-                 + in(i-7,j+0) * (-0.01020408163265306) &
-                 + in(i-6,j+0) * (-0.011904761904761904) &
-                 + in(i-5,j+0) * (-0.014285714285714285) &
-                 + in(i-4,j+0) * (-0.017857142857142856) &
-                 + in(i-3,j+0) * (-0.023809523809523808) &
-                 + in(i-2,j+0) * (-0.03571428571428571) &
-                 + in(i-1,j+0) * (-0.07142857142857142) &
-                 + in(i+1,j+0) * (0.07142857142857142) &
-                 + in(i+2,j+0) * (0.03571428571428571) &
-                 + in(i+3,j+0) * (0.023809523809523808) &
-                 + in(i+4,j+0) * (0.017857142857142856) &
-                 + in(i+5,j+0) * (0.014285714285714285) &
-                 + in(i+6,j+0) * (0.011904761904761904) &
-                 + in(i+7,j+0) * (0.01020408163265306) &
-                 + in(i+0,j+1) * (0.07142857142857142) &
-                 + in(i+0,j+2) * (0.03571428571428571) &
-                 + in(i+0,j+3) * (0.023809523809523808) &
-                 + in(i+0,j+4) * (0.017857142857142856) &
-                 + in(i+0,j+5) * (0.014285714285714285) &
-                 + in(i+0,j+6) * (0.011904761904761904) &
-                 + in(i+0,j+7) * (0.01020408163265306) &
+                 + in(i+0,j-7) * (-0.01020408163265306d0) &
+                 + in(i+0,j-6) * (-0.011904761904761904d0) &
+                 + in(i+0,j-5) * (-0.014285714285714285d0) &
+                 + in(i+0,j-4) * (-0.017857142857142856d0) &
+                 + in(i+0,j-3) * (-0.023809523809523808d0) &
+                 + in(i+0,j-2) * (-0.03571428571428571d0) &
+                 + in(i+0,j-1) * (-0.07142857142857142d0) &
+                 + in(i-7,j+0) * (-0.01020408163265306d0) &
+                 + in(i-6,j+0) * (-0.011904761904761904d0) &
+                 + in(i-5,j+0) * (-0.014285714285714285d0) &
+                 + in(i-4,j+0) * (-0.017857142857142856d0) &
+                 + in(i-3,j+0) * (-0.023809523809523808d0) &
+                 + in(i-2,j+0) * (-0.03571428571428571d0) &
+                 + in(i-1,j+0) * (-0.07142857142857142d0) &
+                 + in(i+1,j+0) * (0.07142857142857142d0) &
+                 + in(i+2,j+0) * (0.03571428571428571d0) &
+                 + in(i+3,j+0) * (0.023809523809523808d0) &
+                 + in(i+4,j+0) * (0.017857142857142856d0) &
+                 + in(i+5,j+0) * (0.014285714285714285d0) &
+                 + in(i+6,j+0) * (0.011904761904761904d0) &
+                 + in(i+7,j+0) * (0.01020408163265306d0) &
+                 + in(i+0,j+1) * (0.07142857142857142d0) &
+                 + in(i+0,j+2) * (0.03571428571428571d0) &
+                 + in(i+0,j+3) * (0.023809523809523808d0) &
+                 + in(i+0,j+4) * (0.017857142857142856d0) &
+                 + in(i+0,j+5) * (0.014285714285714285d0) &
+                 + in(i+0,j+6) * (0.011904761904761904d0) &
+                 + in(i+0,j+7) * (0.01020408163265306d0) &
 +0.0
       end do
     end do
@@ -225,38 +225,38 @@ subroutine star8(n, in, out)
     do i=8,n-8-1
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-8) * (-0.0078125) &
-                 + in(i+0,j-7) * (-0.008928571428571428) &
-                 + in(i+0,j-6) * (-0.010416666666666666) &
-                 + in(i+0,j-5) * (-0.0125) &
-                 + in(i+0,j-4) * (-0.015625) &
-                 + in(i+0,j-3) * (-0.020833333333333332) &
-                 + in(i+0,j-2) * (-0.03125) &
-                 + in(i+0,j-1) * (-0.0625) &
-                 + in(i-8,j+0) * (-0.0078125) &
-                 + in(i-7,j+0) * (-0.008928571428571428) &
-                 + in(i-6,j+0) * (-0.010416666666666666) &
-                 + in(i-5,j+0) * (-0.0125) &
-                 + in(i-4,j+0) * (-0.015625) &
-                 + in(i-3,j+0) * (-0.020833333333333332) &
-                 + in(i-2,j+0) * (-0.03125) &
-                 + in(i-1,j+0) * (-0.0625) &
-                 + in(i+1,j+0) * (0.0625) &
-                 + in(i+2,j+0) * (0.03125) &
-                 + in(i+3,j+0) * (0.020833333333333332) &
-                 + in(i+4,j+0) * (0.015625) &
-                 + in(i+5,j+0) * (0.0125) &
-                 + in(i+6,j+0) * (0.010416666666666666) &
-                 + in(i+7,j+0) * (0.008928571428571428) &
-                 + in(i+8,j+0) * (0.0078125) &
-                 + in(i+0,j+1) * (0.0625) &
-                 + in(i+0,j+2) * (0.03125) &
-                 + in(i+0,j+3) * (0.020833333333333332) &
-                 + in(i+0,j+4) * (0.015625) &
-                 + in(i+0,j+5) * (0.0125) &
-                 + in(i+0,j+6) * (0.010416666666666666) &
-                 + in(i+0,j+7) * (0.008928571428571428) &
-                 + in(i+0,j+8) * (0.0078125) &
+                 + in(i+0,j-8) * (-0.0078125d0) &
+                 + in(i+0,j-7) * (-0.008928571428571428d0) &
+                 + in(i+0,j-6) * (-0.010416666666666666d0) &
+                 + in(i+0,j-5) * (-0.0125d0) &
+                 + in(i+0,j-4) * (-0.015625d0) &
+                 + in(i+0,j-3) * (-0.020833333333333332d0) &
+                 + in(i+0,j-2) * (-0.03125d0) &
+                 + in(i+0,j-1) * (-0.0625d0) &
+                 + in(i-8,j+0) * (-0.0078125d0) &
+                 + in(i-7,j+0) * (-0.008928571428571428d0) &
+                 + in(i-6,j+0) * (-0.010416666666666666d0) &
+                 + in(i-5,j+0) * (-0.0125d0) &
+                 + in(i-4,j+0) * (-0.015625d0) &
+                 + in(i-3,j+0) * (-0.020833333333333332d0) &
+                 + in(i-2,j+0) * (-0.03125d0) &
+                 + in(i-1,j+0) * (-0.0625d0) &
+                 + in(i+1,j+0) * (0.0625d0) &
+                 + in(i+2,j+0) * (0.03125d0) &
+                 + in(i+3,j+0) * (0.020833333333333332d0) &
+                 + in(i+4,j+0) * (0.015625d0) &
+                 + in(i+5,j+0) * (0.0125d0) &
+                 + in(i+6,j+0) * (0.010416666666666666d0) &
+                 + in(i+7,j+0) * (0.008928571428571428d0) &
+                 + in(i+8,j+0) * (0.0078125d0) &
+                 + in(i+0,j+1) * (0.0625d0) &
+                 + in(i+0,j+2) * (0.03125d0) &
+                 + in(i+0,j+3) * (0.020833333333333332d0) &
+                 + in(i+0,j+4) * (0.015625d0) &
+                 + in(i+0,j+5) * (0.0125d0) &
+                 + in(i+0,j+6) * (0.010416666666666666d0) &
+                 + in(i+0,j+7) * (0.008928571428571428d0) &
+                 + in(i+0,j+8) * (0.0078125d0) &
 +0.0
       end do
     end do
@@ -272,42 +272,42 @@ subroutine star9(n, in, out)
     do i=9,n-9-1
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-9) * (-0.006172839506172839) &
-                 + in(i+0,j-8) * (-0.006944444444444444) &
-                 + in(i+0,j-7) * (-0.007936507936507936) &
-                 + in(i+0,j-6) * (-0.009259259259259259) &
-                 + in(i+0,j-5) * (-0.011111111111111112) &
-                 + in(i+0,j-4) * (-0.013888888888888888) &
-                 + in(i+0,j-3) * (-0.018518518518518517) &
-                 + in(i+0,j-2) * (-0.027777777777777776) &
-                 + in(i+0,j-1) * (-0.05555555555555555) &
-                 + in(i-9,j+0) * (-0.006172839506172839) &
-                 + in(i-8,j+0) * (-0.006944444444444444) &
-                 + in(i-7,j+0) * (-0.007936507936507936) &
-                 + in(i-6,j+0) * (-0.009259259259259259) &
-                 + in(i-5,j+0) * (-0.011111111111111112) &
-                 + in(i-4,j+0) * (-0.013888888888888888) &
-                 + in(i-3,j+0) * (-0.018518518518518517) &
-                 + in(i-2,j+0) * (-0.027777777777777776) &
-                 + in(i-1,j+0) * (-0.05555555555555555) &
-                 + in(i+1,j+0) * (0.05555555555555555) &
-                 + in(i+2,j+0) * (0.027777777777777776) &
-                 + in(i+3,j+0) * (0.018518518518518517) &
-                 + in(i+4,j+0) * (0.013888888888888888) &
-                 + in(i+5,j+0) * (0.011111111111111112) &
-                 + in(i+6,j+0) * (0.009259259259259259) &
-                 + in(i+7,j+0) * (0.007936507936507936) &
-                 + in(i+8,j+0) * (0.006944444444444444) &
-                 + in(i+9,j+0) * (0.006172839506172839) &
-                 + in(i+0,j+1) * (0.05555555555555555) &
-                 + in(i+0,j+2) * (0.027777777777777776) &
-                 + in(i+0,j+3) * (0.018518518518518517) &
-                 + in(i+0,j+4) * (0.013888888888888888) &
-                 + in(i+0,j+5) * (0.011111111111111112) &
-                 + in(i+0,j+6) * (0.009259259259259259) &
-                 + in(i+0,j+7) * (0.007936507936507936) &
-                 + in(i+0,j+8) * (0.006944444444444444) &
-                 + in(i+0,j+9) * (0.006172839506172839) &
+                 + in(i+0,j-9) * (-0.006172839506172839d0) &
+                 + in(i+0,j-8) * (-0.006944444444444444d0) &
+                 + in(i+0,j-7) * (-0.007936507936507936d0) &
+                 + in(i+0,j-6) * (-0.009259259259259259d0) &
+                 + in(i+0,j-5) * (-0.011111111111111112d0) &
+                 + in(i+0,j-4) * (-0.013888888888888888d0) &
+                 + in(i+0,j-3) * (-0.018518518518518517d0) &
+                 + in(i+0,j-2) * (-0.027777777777777776d0) &
+                 + in(i+0,j-1) * (-0.05555555555555555d0) &
+                 + in(i-9,j+0) * (-0.006172839506172839d0) &
+                 + in(i-8,j+0) * (-0.006944444444444444d0) &
+                 + in(i-7,j+0) * (-0.007936507936507936d0) &
+                 + in(i-6,j+0) * (-0.009259259259259259d0) &
+                 + in(i-5,j+0) * (-0.011111111111111112d0) &
+                 + in(i-4,j+0) * (-0.013888888888888888d0) &
+                 + in(i-3,j+0) * (-0.018518518518518517d0) &
+                 + in(i-2,j+0) * (-0.027777777777777776d0) &
+                 + in(i-1,j+0) * (-0.05555555555555555d0) &
+                 + in(i+1,j+0) * (0.05555555555555555d0) &
+                 + in(i+2,j+0) * (0.027777777777777776d0) &
+                 + in(i+3,j+0) * (0.018518518518518517d0) &
+                 + in(i+4,j+0) * (0.013888888888888888d0) &
+                 + in(i+5,j+0) * (0.011111111111111112d0) &
+                 + in(i+6,j+0) * (0.009259259259259259d0) &
+                 + in(i+7,j+0) * (0.007936507936507936d0) &
+                 + in(i+8,j+0) * (0.006944444444444444d0) &
+                 + in(i+9,j+0) * (0.006172839506172839d0) &
+                 + in(i+0,j+1) * (0.05555555555555555d0) &
+                 + in(i+0,j+2) * (0.027777777777777776d0) &
+                 + in(i+0,j+3) * (0.018518518518518517d0) &
+                 + in(i+0,j+4) * (0.013888888888888888d0) &
+                 + in(i+0,j+5) * (0.011111111111111112d0) &
+                 + in(i+0,j+6) * (0.009259259259259259d0) &
+                 + in(i+0,j+7) * (0.007936507936507936d0) &
+                 + in(i+0,j+8) * (0.006944444444444444d0) &
+                 + in(i+0,j+9) * (0.006172839506172839d0) &
 +0.0
       end do
     end do
@@ -323,10 +323,10 @@ subroutine grid1(n, in, out)
     do i=1,n-1-1
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i-1,j-1) * (-0.25) &
-                 + in(i+1,j-1) * (-0.25) &
-                 + in(i-1,j+1) * (-0.25) &
-                 + in(i+1,j+1) * (0.25) &
+                 + in(i-1,j-1) * (-0.25d0) &
+                 + in(i+1,j-1) * (-0.25d0) &
+                 + in(i-1,j+1) * (-0.25d0) &
+                 + in(i+1,j+1) * (0.25d0) &
 +0.0
       end do
     end do
@@ -342,20 +342,20 @@ subroutine grid2(n, in, out)
     do i=2,n-2-1
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i-2,j-2) * (-0.0625) &
-                 + in(i+1,j-2) * (-0.020833333333333332) &
-                 + in(i+2,j-2) * (-0.020833333333333332) &
-                 + in(i-1,j-1) * (-0.125) &
-                 + in(i+1,j-1) * (-0.125) &
-                 + in(i+2,j-1) * (-0.125) &
-                 + in(i-2,j+1) * (-0.020833333333333332) &
-                 + in(i-1,j+1) * (-0.125) &
-                 + in(i+1,j+1) * (0.125) &
-                 + in(i+2,j+1) * (0.020833333333333332) &
-                 + in(i-2,j+2) * (-0.020833333333333332) &
-                 + in(i-1,j+2) * (-0.125) &
-                 + in(i+1,j+2) * (0.020833333333333332) &
-                 + in(i+2,j+2) * (0.0625) &
+                 + in(i-2,j-2) * (-0.0625d0) &
+                 + in(i+1,j-2) * (-0.020833333333333332d0) &
+                 + in(i+2,j-2) * (-0.020833333333333332d0) &
+                 + in(i-1,j-1) * (-0.125d0) &
+                 + in(i+1,j-1) * (-0.125d0) &
+                 + in(i+2,j-1) * (-0.125d0) &
+                 + in(i-2,j+1) * (-0.020833333333333332d0) &
+                 + in(i-1,j+1) * (-0.125d0) &
+                 + in(i+1,j+1) * (0.125d0) &
+                 + in(i+2,j+1) * (0.020833333333333332d0) &
+                 + in(i-2,j+2) * (-0.020833333333333332d0) &
+                 + in(i-1,j+2) * (-0.125d0) &
+                 + in(i+1,j+2) * (0.020833333333333332d0) &
+                 + in(i+2,j+2) * (0.0625d0) &
 +0.0
       end do
     end do
@@ -371,36 +371,36 @@ subroutine grid3(n, in, out)
     do i=3,n-3-1
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i-3,j-3) * (-0.027777777777777776) &
-                 + in(i+1,j-3) * (-0.005555555555555556) &
-                 + in(i+2,j-3) * (-0.005555555555555556) &
-                 + in(i+3,j-3) * (-0.005555555555555556) &
-                 + in(i-2,j-2) * (-0.041666666666666664) &
-                 + in(i+1,j-2) * (-0.013888888888888888) &
-                 + in(i+2,j-2) * (-0.013888888888888888) &
-                 + in(i+3,j-2) * (-0.013888888888888888) &
-                 + in(i-1,j-1) * (-0.08333333333333333) &
-                 + in(i+1,j-1) * (-0.08333333333333333) &
-                 + in(i+2,j-1) * (-0.08333333333333333) &
-                 + in(i+3,j-1) * (-0.08333333333333333) &
-                 + in(i-3,j+1) * (-0.005555555555555556) &
-                 + in(i-2,j+1) * (-0.013888888888888888) &
-                 + in(i-1,j+1) * (-0.08333333333333333) &
-                 + in(i+1,j+1) * (0.08333333333333333) &
-                 + in(i+2,j+1) * (0.013888888888888888) &
-                 + in(i+3,j+1) * (0.005555555555555556) &
-                 + in(i-3,j+2) * (-0.005555555555555556) &
-                 + in(i-2,j+2) * (-0.013888888888888888) &
-                 + in(i-1,j+2) * (-0.08333333333333333) &
-                 + in(i+1,j+2) * (0.013888888888888888) &
-                 + in(i+2,j+2) * (0.041666666666666664) &
-                 + in(i+3,j+2) * (0.005555555555555556) &
-                 + in(i-3,j+3) * (-0.005555555555555556) &
-                 + in(i-2,j+3) * (-0.013888888888888888) &
-                 + in(i-1,j+3) * (-0.08333333333333333) &
-                 + in(i+1,j+3) * (0.005555555555555556) &
-                 + in(i+2,j+3) * (0.005555555555555556) &
-                 + in(i+3,j+3) * (0.027777777777777776) &
+                 + in(i-3,j-3) * (-0.027777777777777776d0) &
+                 + in(i+1,j-3) * (-0.005555555555555556d0) &
+                 + in(i+2,j-3) * (-0.005555555555555556d0) &
+                 + in(i+3,j-3) * (-0.005555555555555556d0) &
+                 + in(i-2,j-2) * (-0.041666666666666664d0) &
+                 + in(i+1,j-2) * (-0.013888888888888888d0) &
+                 + in(i+2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+3,j-2) * (-0.013888888888888888d0) &
+                 + in(i-1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+2,j-1) * (-0.08333333333333333d0) &
+                 + in(i+3,j-1) * (-0.08333333333333333d0) &
+                 + in(i-3,j+1) * (-0.005555555555555556d0) &
+                 + in(i-2,j+1) * (-0.013888888888888888d0) &
+                 + in(i-1,j+1) * (-0.08333333333333333d0) &
+                 + in(i+1,j+1) * (0.08333333333333333d0) &
+                 + in(i+2,j+1) * (0.013888888888888888d0) &
+                 + in(i+3,j+1) * (0.005555555555555556d0) &
+                 + in(i-3,j+2) * (-0.005555555555555556d0) &
+                 + in(i-2,j+2) * (-0.013888888888888888d0) &
+                 + in(i-1,j+2) * (-0.08333333333333333d0) &
+                 + in(i+1,j+2) * (0.013888888888888888d0) &
+                 + in(i+2,j+2) * (0.041666666666666664d0) &
+                 + in(i+3,j+2) * (0.005555555555555556d0) &
+                 + in(i-3,j+3) * (-0.005555555555555556d0) &
+                 + in(i-2,j+3) * (-0.013888888888888888d0) &
+                 + in(i-1,j+3) * (-0.08333333333333333d0) &
+                 + in(i+1,j+3) * (0.005555555555555556d0) &
+                 + in(i+2,j+3) * (0.005555555555555556d0) &
+                 + in(i+3,j+3) * (0.027777777777777776d0) &
 +0.0
       end do
     end do
@@ -416,58 +416,58 @@ subroutine grid4(n, in, out)
     do i=4,n-4-1
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i-4,j-4) * (-0.015625) &
-                 + in(i+1,j-4) * (-0.002232142857142857) &
-                 + in(i+2,j-4) * (-0.002232142857142857) &
-                 + in(i+3,j-4) * (-0.002232142857142857) &
-                 + in(i+4,j-4) * (-0.002232142857142857) &
-                 + in(i-3,j-3) * (-0.020833333333333332) &
-                 + in(i+1,j-3) * (-0.004166666666666667) &
-                 + in(i+2,j-3) * (-0.004166666666666667) &
-                 + in(i+3,j-3) * (-0.004166666666666667) &
-                 + in(i+4,j-3) * (-0.004166666666666667) &
-                 + in(i-2,j-2) * (-0.03125) &
-                 + in(i+1,j-2) * (-0.010416666666666666) &
-                 + in(i+2,j-2) * (-0.010416666666666666) &
-                 + in(i+3,j-2) * (-0.010416666666666666) &
-                 + in(i+4,j-2) * (-0.010416666666666666) &
-                 + in(i-1,j-1) * (-0.0625) &
-                 + in(i+1,j-1) * (-0.0625) &
-                 + in(i+2,j-1) * (-0.0625) &
-                 + in(i+3,j-1) * (-0.0625) &
-                 + in(i+4,j-1) * (-0.0625) &
-                 + in(i-4,j+1) * (-0.002232142857142857) &
-                 + in(i-3,j+1) * (-0.004166666666666667) &
-                 + in(i-2,j+1) * (-0.010416666666666666) &
-                 + in(i-1,j+1) * (-0.0625) &
-                 + in(i+1,j+1) * (0.0625) &
-                 + in(i+2,j+1) * (0.010416666666666666) &
-                 + in(i+3,j+1) * (0.004166666666666667) &
-                 + in(i+4,j+1) * (0.002232142857142857) &
-                 + in(i-4,j+2) * (-0.002232142857142857) &
-                 + in(i-3,j+2) * (-0.004166666666666667) &
-                 + in(i-2,j+2) * (-0.010416666666666666) &
-                 + in(i-1,j+2) * (-0.0625) &
-                 + in(i+1,j+2) * (0.010416666666666666) &
-                 + in(i+2,j+2) * (0.03125) &
-                 + in(i+3,j+2) * (0.004166666666666667) &
-                 + in(i+4,j+2) * (0.002232142857142857) &
-                 + in(i-4,j+3) * (-0.002232142857142857) &
-                 + in(i-3,j+3) * (-0.004166666666666667) &
-                 + in(i-2,j+3) * (-0.010416666666666666) &
-                 + in(i-1,j+3) * (-0.0625) &
-                 + in(i+1,j+3) * (0.004166666666666667) &
-                 + in(i+2,j+3) * (0.004166666666666667) &
-                 + in(i+3,j+3) * (0.020833333333333332) &
-                 + in(i+4,j+3) * (0.002232142857142857) &
-                 + in(i-4,j+4) * (-0.002232142857142857) &
-                 + in(i-3,j+4) * (-0.004166666666666667) &
-                 + in(i-2,j+4) * (-0.010416666666666666) &
-                 + in(i-1,j+4) * (-0.0625) &
-                 + in(i+1,j+4) * (0.002232142857142857) &
-                 + in(i+2,j+4) * (0.002232142857142857) &
-                 + in(i+3,j+4) * (0.002232142857142857) &
-                 + in(i+4,j+4) * (0.015625) &
+                 + in(i-4,j-4) * (-0.015625d0) &
+                 + in(i+1,j-4) * (-0.002232142857142857d0) &
+                 + in(i+2,j-4) * (-0.002232142857142857d0) &
+                 + in(i+3,j-4) * (-0.002232142857142857d0) &
+                 + in(i+4,j-4) * (-0.002232142857142857d0) &
+                 + in(i-3,j-3) * (-0.020833333333333332d0) &
+                 + in(i+1,j-3) * (-0.004166666666666667d0) &
+                 + in(i+2,j-3) * (-0.004166666666666667d0) &
+                 + in(i+3,j-3) * (-0.004166666666666667d0) &
+                 + in(i+4,j-3) * (-0.004166666666666667d0) &
+                 + in(i-2,j-2) * (-0.03125d0) &
+                 + in(i+1,j-2) * (-0.010416666666666666d0) &
+                 + in(i+2,j-2) * (-0.010416666666666666d0) &
+                 + in(i+3,j-2) * (-0.010416666666666666d0) &
+                 + in(i+4,j-2) * (-0.010416666666666666d0) &
+                 + in(i-1,j-1) * (-0.0625d0) &
+                 + in(i+1,j-1) * (-0.0625d0) &
+                 + in(i+2,j-1) * (-0.0625d0) &
+                 + in(i+3,j-1) * (-0.0625d0) &
+                 + in(i+4,j-1) * (-0.0625d0) &
+                 + in(i-4,j+1) * (-0.002232142857142857d0) &
+                 + in(i-3,j+1) * (-0.004166666666666667d0) &
+                 + in(i-2,j+1) * (-0.010416666666666666d0) &
+                 + in(i-1,j+1) * (-0.0625d0) &
+                 + in(i+1,j+1) * (0.0625d0) &
+                 + in(i+2,j+1) * (0.010416666666666666d0) &
+                 + in(i+3,j+1) * (0.004166666666666667d0) &
+                 + in(i+4,j+1) * (0.002232142857142857d0) &
+                 + in(i-4,j+2) * (-0.002232142857142857d0) &
+                 + in(i-3,j+2) * (-0.004166666666666667d0) &
+                 + in(i-2,j+2) * (-0.010416666666666666d0) &
+                 + in(i-1,j+2) * (-0.0625d0) &
+                 + in(i+1,j+2) * (0.010416666666666666d0) &
+                 + in(i+2,j+2) * (0.03125d0) &
+                 + in(i+3,j+2) * (0.004166666666666667d0) &
+                 + in(i+4,j+2) * (0.002232142857142857d0) &
+                 + in(i-4,j+3) * (-0.002232142857142857d0) &
+                 + in(i-3,j+3) * (-0.004166666666666667d0) &
+                 + in(i-2,j+3) * (-0.010416666666666666d0) &
+                 + in(i-1,j+3) * (-0.0625d0) &
+                 + in(i+1,j+3) * (0.004166666666666667d0) &
+                 + in(i+2,j+3) * (0.004166666666666667d0) &
+                 + in(i+3,j+3) * (0.020833333333333332d0) &
+                 + in(i+4,j+3) * (0.002232142857142857d0) &
+                 + in(i-4,j+4) * (-0.002232142857142857d0) &
+                 + in(i-3,j+4) * (-0.004166666666666667d0) &
+                 + in(i-2,j+4) * (-0.010416666666666666d0) &
+                 + in(i-1,j+4) * (-0.0625d0) &
+                 + in(i+1,j+4) * (0.002232142857142857d0) &
+                 + in(i+2,j+4) * (0.002232142857142857d0) &
+                 + in(i+3,j+4) * (0.002232142857142857d0) &
+                 + in(i+4,j+4) * (0.015625d0) &
 +0.0
       end do
     end do
@@ -483,86 +483,86 @@ subroutine grid5(n, in, out)
     do i=5,n-5-1
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i-5,j-5) * (-0.01) &
-                 + in(i+1,j-5) * (-0.0011111111111111111) &
-                 + in(i+2,j-5) * (-0.0011111111111111111) &
-                 + in(i+3,j-5) * (-0.0011111111111111111) &
-                 + in(i+4,j-5) * (-0.0011111111111111111) &
-                 + in(i+5,j-5) * (-0.0011111111111111111) &
-                 + in(i-4,j-4) * (-0.0125) &
-                 + in(i+1,j-4) * (-0.0017857142857142857) &
-                 + in(i+2,j-4) * (-0.0017857142857142857) &
-                 + in(i+3,j-4) * (-0.0017857142857142857) &
-                 + in(i+4,j-4) * (-0.0017857142857142857) &
-                 + in(i+5,j-4) * (-0.0017857142857142857) &
-                 + in(i-3,j-3) * (-0.016666666666666666) &
-                 + in(i+1,j-3) * (-0.0033333333333333335) &
-                 + in(i+2,j-3) * (-0.0033333333333333335) &
-                 + in(i+3,j-3) * (-0.0033333333333333335) &
-                 + in(i+4,j-3) * (-0.0033333333333333335) &
-                 + in(i+5,j-3) * (-0.0033333333333333335) &
-                 + in(i-2,j-2) * (-0.025) &
-                 + in(i+1,j-2) * (-0.008333333333333333) &
-                 + in(i+2,j-2) * (-0.008333333333333333) &
-                 + in(i+3,j-2) * (-0.008333333333333333) &
-                 + in(i+4,j-2) * (-0.008333333333333333) &
-                 + in(i+5,j-2) * (-0.008333333333333333) &
-                 + in(i-1,j-1) * (-0.05) &
-                 + in(i+1,j-1) * (-0.05) &
-                 + in(i+2,j-1) * (-0.05) &
-                 + in(i+3,j-1) * (-0.05) &
-                 + in(i+4,j-1) * (-0.05) &
-                 + in(i+5,j-1) * (-0.05) &
-                 + in(i-5,j+1) * (-0.0011111111111111111) &
-                 + in(i-4,j+1) * (-0.0017857142857142857) &
-                 + in(i-3,j+1) * (-0.0033333333333333335) &
-                 + in(i-2,j+1) * (-0.008333333333333333) &
-                 + in(i-1,j+1) * (-0.05) &
-                 + in(i+1,j+1) * (0.05) &
-                 + in(i+2,j+1) * (0.008333333333333333) &
-                 + in(i+3,j+1) * (0.0033333333333333335) &
-                 + in(i+4,j+1) * (0.0017857142857142857) &
-                 + in(i+5,j+1) * (0.0011111111111111111) &
-                 + in(i-5,j+2) * (-0.0011111111111111111) &
-                 + in(i-4,j+2) * (-0.0017857142857142857) &
-                 + in(i-3,j+2) * (-0.0033333333333333335) &
-                 + in(i-2,j+2) * (-0.008333333333333333) &
-                 + in(i-1,j+2) * (-0.05) &
-                 + in(i+1,j+2) * (0.008333333333333333) &
-                 + in(i+2,j+2) * (0.025) &
-                 + in(i+3,j+2) * (0.0033333333333333335) &
-                 + in(i+4,j+2) * (0.0017857142857142857) &
-                 + in(i+5,j+2) * (0.0011111111111111111) &
-                 + in(i-5,j+3) * (-0.0011111111111111111) &
-                 + in(i-4,j+3) * (-0.0017857142857142857) &
-                 + in(i-3,j+3) * (-0.0033333333333333335) &
-                 + in(i-2,j+3) * (-0.008333333333333333) &
-                 + in(i-1,j+3) * (-0.05) &
-                 + in(i+1,j+3) * (0.0033333333333333335) &
-                 + in(i+2,j+3) * (0.0033333333333333335) &
-                 + in(i+3,j+3) * (0.016666666666666666) &
-                 + in(i+4,j+3) * (0.0017857142857142857) &
-                 + in(i+5,j+3) * (0.0011111111111111111) &
-                 + in(i-5,j+4) * (-0.0011111111111111111) &
-                 + in(i-4,j+4) * (-0.0017857142857142857) &
-                 + in(i-3,j+4) * (-0.0033333333333333335) &
-                 + in(i-2,j+4) * (-0.008333333333333333) &
-                 + in(i-1,j+4) * (-0.05) &
-                 + in(i+1,j+4) * (0.0017857142857142857) &
-                 + in(i+2,j+4) * (0.0017857142857142857) &
-                 + in(i+3,j+4) * (0.0017857142857142857) &
-                 + in(i+4,j+4) * (0.0125) &
-                 + in(i+5,j+4) * (0.0011111111111111111) &
-                 + in(i-5,j+5) * (-0.0011111111111111111) &
-                 + in(i-4,j+5) * (-0.0017857142857142857) &
-                 + in(i-3,j+5) * (-0.0033333333333333335) &
-                 + in(i-2,j+5) * (-0.008333333333333333) &
-                 + in(i-1,j+5) * (-0.05) &
-                 + in(i+1,j+5) * (0.0011111111111111111) &
-                 + in(i+2,j+5) * (0.0011111111111111111) &
-                 + in(i+3,j+5) * (0.0011111111111111111) &
-                 + in(i+4,j+5) * (0.0011111111111111111) &
-                 + in(i+5,j+5) * (0.01) &
+                 + in(i-5,j-5) * (-0.01d0) &
+                 + in(i+1,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+2,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+3,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+4,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+5,j-5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j-4) * (-0.0125d0) &
+                 + in(i+1,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+2,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+3,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+4,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+5,j-4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j-3) * (-0.016666666666666666d0) &
+                 + in(i+1,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+2,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+3,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+4,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+5,j-3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j-2) * (-0.025d0) &
+                 + in(i+1,j-2) * (-0.008333333333333333d0) &
+                 + in(i+2,j-2) * (-0.008333333333333333d0) &
+                 + in(i+3,j-2) * (-0.008333333333333333d0) &
+                 + in(i+4,j-2) * (-0.008333333333333333d0) &
+                 + in(i+5,j-2) * (-0.008333333333333333d0) &
+                 + in(i-1,j-1) * (-0.05d0) &
+                 + in(i+1,j-1) * (-0.05d0) &
+                 + in(i+2,j-1) * (-0.05d0) &
+                 + in(i+3,j-1) * (-0.05d0) &
+                 + in(i+4,j-1) * (-0.05d0) &
+                 + in(i+5,j-1) * (-0.05d0) &
+                 + in(i-5,j+1) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+1) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+1) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+1) * (-0.008333333333333333d0) &
+                 + in(i-1,j+1) * (-0.05d0) &
+                 + in(i+1,j+1) * (0.05d0) &
+                 + in(i+2,j+1) * (0.008333333333333333d0) &
+                 + in(i+3,j+1) * (0.0033333333333333335d0) &
+                 + in(i+4,j+1) * (0.0017857142857142857d0) &
+                 + in(i+5,j+1) * (0.0011111111111111111d0) &
+                 + in(i-5,j+2) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+2) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+2) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+2) * (-0.008333333333333333d0) &
+                 + in(i-1,j+2) * (-0.05d0) &
+                 + in(i+1,j+2) * (0.008333333333333333d0) &
+                 + in(i+2,j+2) * (0.025d0) &
+                 + in(i+3,j+2) * (0.0033333333333333335d0) &
+                 + in(i+4,j+2) * (0.0017857142857142857d0) &
+                 + in(i+5,j+2) * (0.0011111111111111111d0) &
+                 + in(i-5,j+3) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+3) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+3) * (-0.008333333333333333d0) &
+                 + in(i-1,j+3) * (-0.05d0) &
+                 + in(i+1,j+3) * (0.0033333333333333335d0) &
+                 + in(i+2,j+3) * (0.0033333333333333335d0) &
+                 + in(i+3,j+3) * (0.016666666666666666d0) &
+                 + in(i+4,j+3) * (0.0017857142857142857d0) &
+                 + in(i+5,j+3) * (0.0011111111111111111d0) &
+                 + in(i-5,j+4) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+4) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+4) * (-0.008333333333333333d0) &
+                 + in(i-1,j+4) * (-0.05d0) &
+                 + in(i+1,j+4) * (0.0017857142857142857d0) &
+                 + in(i+2,j+4) * (0.0017857142857142857d0) &
+                 + in(i+3,j+4) * (0.0017857142857142857d0) &
+                 + in(i+4,j+4) * (0.0125d0) &
+                 + in(i+5,j+4) * (0.0011111111111111111d0) &
+                 + in(i-5,j+5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+5) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+5) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+5) * (-0.008333333333333333d0) &
+                 + in(i-1,j+5) * (-0.05d0) &
+                 + in(i+1,j+5) * (0.0011111111111111111d0) &
+                 + in(i+2,j+5) * (0.0011111111111111111d0) &
+                 + in(i+3,j+5) * (0.0011111111111111111d0) &
+                 + in(i+4,j+5) * (0.0011111111111111111d0) &
+                 + in(i+5,j+5) * (0.01d0) &
 +0.0
       end do
     end do
@@ -578,120 +578,120 @@ subroutine grid6(n, in, out)
     do i=6,n-6-1
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i-6,j-6) * (-0.006944444444444444) &
-                 + in(i+1,j-6) * (-0.0006313131313131314) &
-                 + in(i+2,j-6) * (-0.0006313131313131314) &
-                 + in(i+3,j-6) * (-0.0006313131313131314) &
-                 + in(i+4,j-6) * (-0.0006313131313131314) &
-                 + in(i+5,j-6) * (-0.0006313131313131314) &
-                 + in(i+6,j-6) * (-0.0006313131313131314) &
-                 + in(i-5,j-5) * (-0.008333333333333333) &
-                 + in(i+1,j-5) * (-0.000925925925925926) &
-                 + in(i+2,j-5) * (-0.000925925925925926) &
-                 + in(i+3,j-5) * (-0.000925925925925926) &
-                 + in(i+4,j-5) * (-0.000925925925925926) &
-                 + in(i+5,j-5) * (-0.000925925925925926) &
-                 + in(i+6,j-5) * (-0.000925925925925926) &
-                 + in(i-4,j-4) * (-0.010416666666666666) &
-                 + in(i+1,j-4) * (-0.001488095238095238) &
-                 + in(i+2,j-4) * (-0.001488095238095238) &
-                 + in(i+3,j-4) * (-0.001488095238095238) &
-                 + in(i+4,j-4) * (-0.001488095238095238) &
-                 + in(i+5,j-4) * (-0.001488095238095238) &
-                 + in(i+6,j-4) * (-0.001488095238095238) &
-                 + in(i-3,j-3) * (-0.013888888888888888) &
-                 + in(i+1,j-3) * (-0.002777777777777778) &
-                 + in(i+2,j-3) * (-0.002777777777777778) &
-                 + in(i+3,j-3) * (-0.002777777777777778) &
-                 + in(i+4,j-3) * (-0.002777777777777778) &
-                 + in(i+5,j-3) * (-0.002777777777777778) &
-                 + in(i+6,j-3) * (-0.002777777777777778) &
-                 + in(i-2,j-2) * (-0.020833333333333332) &
-                 + in(i+1,j-2) * (-0.006944444444444444) &
-                 + in(i+2,j-2) * (-0.006944444444444444) &
-                 + in(i+3,j-2) * (-0.006944444444444444) &
-                 + in(i+4,j-2) * (-0.006944444444444444) &
-                 + in(i+5,j-2) * (-0.006944444444444444) &
-                 + in(i+6,j-2) * (-0.006944444444444444) &
-                 + in(i-1,j-1) * (-0.041666666666666664) &
-                 + in(i+1,j-1) * (-0.041666666666666664) &
-                 + in(i+2,j-1) * (-0.041666666666666664) &
-                 + in(i+3,j-1) * (-0.041666666666666664) &
-                 + in(i+4,j-1) * (-0.041666666666666664) &
-                 + in(i+5,j-1) * (-0.041666666666666664) &
-                 + in(i+6,j-1) * (-0.041666666666666664) &
-                 + in(i-6,j+1) * (-0.0006313131313131314) &
-                 + in(i-5,j+1) * (-0.000925925925925926) &
-                 + in(i-4,j+1) * (-0.001488095238095238) &
-                 + in(i-3,j+1) * (-0.002777777777777778) &
-                 + in(i-2,j+1) * (-0.006944444444444444) &
-                 + in(i-1,j+1) * (-0.041666666666666664) &
-                 + in(i+1,j+1) * (0.041666666666666664) &
-                 + in(i+2,j+1) * (0.006944444444444444) &
-                 + in(i+3,j+1) * (0.002777777777777778) &
-                 + in(i+4,j+1) * (0.001488095238095238) &
-                 + in(i+5,j+1) * (0.000925925925925926) &
-                 + in(i+6,j+1) * (0.0006313131313131314) &
-                 + in(i-6,j+2) * (-0.0006313131313131314) &
-                 + in(i-5,j+2) * (-0.000925925925925926) &
-                 + in(i-4,j+2) * (-0.001488095238095238) &
-                 + in(i-3,j+2) * (-0.002777777777777778) &
-                 + in(i-2,j+2) * (-0.006944444444444444) &
-                 + in(i-1,j+2) * (-0.041666666666666664) &
-                 + in(i+1,j+2) * (0.006944444444444444) &
-                 + in(i+2,j+2) * (0.020833333333333332) &
-                 + in(i+3,j+2) * (0.002777777777777778) &
-                 + in(i+4,j+2) * (0.001488095238095238) &
-                 + in(i+5,j+2) * (0.000925925925925926) &
-                 + in(i+6,j+2) * (0.0006313131313131314) &
-                 + in(i-6,j+3) * (-0.0006313131313131314) &
-                 + in(i-5,j+3) * (-0.000925925925925926) &
-                 + in(i-4,j+3) * (-0.001488095238095238) &
-                 + in(i-3,j+3) * (-0.002777777777777778) &
-                 + in(i-2,j+3) * (-0.006944444444444444) &
-                 + in(i-1,j+3) * (-0.041666666666666664) &
-                 + in(i+1,j+3) * (0.002777777777777778) &
-                 + in(i+2,j+3) * (0.002777777777777778) &
-                 + in(i+3,j+3) * (0.013888888888888888) &
-                 + in(i+4,j+3) * (0.001488095238095238) &
-                 + in(i+5,j+3) * (0.000925925925925926) &
-                 + in(i+6,j+3) * (0.0006313131313131314) &
-                 + in(i-6,j+4) * (-0.0006313131313131314) &
-                 + in(i-5,j+4) * (-0.000925925925925926) &
-                 + in(i-4,j+4) * (-0.001488095238095238) &
-                 + in(i-3,j+4) * (-0.002777777777777778) &
-                 + in(i-2,j+4) * (-0.006944444444444444) &
-                 + in(i-1,j+4) * (-0.041666666666666664) &
-                 + in(i+1,j+4) * (0.001488095238095238) &
-                 + in(i+2,j+4) * (0.001488095238095238) &
-                 + in(i+3,j+4) * (0.001488095238095238) &
-                 + in(i+4,j+4) * (0.010416666666666666) &
-                 + in(i+5,j+4) * (0.000925925925925926) &
-                 + in(i+6,j+4) * (0.0006313131313131314) &
-                 + in(i-6,j+5) * (-0.0006313131313131314) &
-                 + in(i-5,j+5) * (-0.000925925925925926) &
-                 + in(i-4,j+5) * (-0.001488095238095238) &
-                 + in(i-3,j+5) * (-0.002777777777777778) &
-                 + in(i-2,j+5) * (-0.006944444444444444) &
-                 + in(i-1,j+5) * (-0.041666666666666664) &
-                 + in(i+1,j+5) * (0.000925925925925926) &
-                 + in(i+2,j+5) * (0.000925925925925926) &
-                 + in(i+3,j+5) * (0.000925925925925926) &
-                 + in(i+4,j+5) * (0.000925925925925926) &
-                 + in(i+5,j+5) * (0.008333333333333333) &
-                 + in(i+6,j+5) * (0.0006313131313131314) &
-                 + in(i-6,j+6) * (-0.0006313131313131314) &
-                 + in(i-5,j+6) * (-0.000925925925925926) &
-                 + in(i-4,j+6) * (-0.001488095238095238) &
-                 + in(i-3,j+6) * (-0.002777777777777778) &
-                 + in(i-2,j+6) * (-0.006944444444444444) &
-                 + in(i-1,j+6) * (-0.041666666666666664) &
-                 + in(i+1,j+6) * (0.0006313131313131314) &
-                 + in(i+2,j+6) * (0.0006313131313131314) &
-                 + in(i+3,j+6) * (0.0006313131313131314) &
-                 + in(i+4,j+6) * (0.0006313131313131314) &
-                 + in(i+5,j+6) * (0.0006313131313131314) &
-                 + in(i+6,j+6) * (0.006944444444444444) &
+                 + in(i-6,j-6) * (-0.006944444444444444d0) &
+                 + in(i+1,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+2,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+3,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+4,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+5,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+6,j-6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j-5) * (-0.008333333333333333d0) &
+                 + in(i+1,j-5) * (-0.000925925925925926d0) &
+                 + in(i+2,j-5) * (-0.000925925925925926d0) &
+                 + in(i+3,j-5) * (-0.000925925925925926d0) &
+                 + in(i+4,j-5) * (-0.000925925925925926d0) &
+                 + in(i+5,j-5) * (-0.000925925925925926d0) &
+                 + in(i+6,j-5) * (-0.000925925925925926d0) &
+                 + in(i-4,j-4) * (-0.010416666666666666d0) &
+                 + in(i+1,j-4) * (-0.001488095238095238d0) &
+                 + in(i+2,j-4) * (-0.001488095238095238d0) &
+                 + in(i+3,j-4) * (-0.001488095238095238d0) &
+                 + in(i+4,j-4) * (-0.001488095238095238d0) &
+                 + in(i+5,j-4) * (-0.001488095238095238d0) &
+                 + in(i+6,j-4) * (-0.001488095238095238d0) &
+                 + in(i-3,j-3) * (-0.013888888888888888d0) &
+                 + in(i+1,j-3) * (-0.002777777777777778d0) &
+                 + in(i+2,j-3) * (-0.002777777777777778d0) &
+                 + in(i+3,j-3) * (-0.002777777777777778d0) &
+                 + in(i+4,j-3) * (-0.002777777777777778d0) &
+                 + in(i+5,j-3) * (-0.002777777777777778d0) &
+                 + in(i+6,j-3) * (-0.002777777777777778d0) &
+                 + in(i-2,j-2) * (-0.020833333333333332d0) &
+                 + in(i+1,j-2) * (-0.006944444444444444d0) &
+                 + in(i+2,j-2) * (-0.006944444444444444d0) &
+                 + in(i+3,j-2) * (-0.006944444444444444d0) &
+                 + in(i+4,j-2) * (-0.006944444444444444d0) &
+                 + in(i+5,j-2) * (-0.006944444444444444d0) &
+                 + in(i+6,j-2) * (-0.006944444444444444d0) &
+                 + in(i-1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+2,j-1) * (-0.041666666666666664d0) &
+                 + in(i+3,j-1) * (-0.041666666666666664d0) &
+                 + in(i+4,j-1) * (-0.041666666666666664d0) &
+                 + in(i+5,j-1) * (-0.041666666666666664d0) &
+                 + in(i+6,j-1) * (-0.041666666666666664d0) &
+                 + in(i-6,j+1) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+1) * (-0.000925925925925926d0) &
+                 + in(i-4,j+1) * (-0.001488095238095238d0) &
+                 + in(i-3,j+1) * (-0.002777777777777778d0) &
+                 + in(i-2,j+1) * (-0.006944444444444444d0) &
+                 + in(i-1,j+1) * (-0.041666666666666664d0) &
+                 + in(i+1,j+1) * (0.041666666666666664d0) &
+                 + in(i+2,j+1) * (0.006944444444444444d0) &
+                 + in(i+3,j+1) * (0.002777777777777778d0) &
+                 + in(i+4,j+1) * (0.001488095238095238d0) &
+                 + in(i+5,j+1) * (0.000925925925925926d0) &
+                 + in(i+6,j+1) * (0.0006313131313131314d0) &
+                 + in(i-6,j+2) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+2) * (-0.000925925925925926d0) &
+                 + in(i-4,j+2) * (-0.001488095238095238d0) &
+                 + in(i-3,j+2) * (-0.002777777777777778d0) &
+                 + in(i-2,j+2) * (-0.006944444444444444d0) &
+                 + in(i-1,j+2) * (-0.041666666666666664d0) &
+                 + in(i+1,j+2) * (0.006944444444444444d0) &
+                 + in(i+2,j+2) * (0.020833333333333332d0) &
+                 + in(i+3,j+2) * (0.002777777777777778d0) &
+                 + in(i+4,j+2) * (0.001488095238095238d0) &
+                 + in(i+5,j+2) * (0.000925925925925926d0) &
+                 + in(i+6,j+2) * (0.0006313131313131314d0) &
+                 + in(i-6,j+3) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+3) * (-0.000925925925925926d0) &
+                 + in(i-4,j+3) * (-0.001488095238095238d0) &
+                 + in(i-3,j+3) * (-0.002777777777777778d0) &
+                 + in(i-2,j+3) * (-0.006944444444444444d0) &
+                 + in(i-1,j+3) * (-0.041666666666666664d0) &
+                 + in(i+1,j+3) * (0.002777777777777778d0) &
+                 + in(i+2,j+3) * (0.002777777777777778d0) &
+                 + in(i+3,j+3) * (0.013888888888888888d0) &
+                 + in(i+4,j+3) * (0.001488095238095238d0) &
+                 + in(i+5,j+3) * (0.000925925925925926d0) &
+                 + in(i+6,j+3) * (0.0006313131313131314d0) &
+                 + in(i-6,j+4) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+4) * (-0.000925925925925926d0) &
+                 + in(i-4,j+4) * (-0.001488095238095238d0) &
+                 + in(i-3,j+4) * (-0.002777777777777778d0) &
+                 + in(i-2,j+4) * (-0.006944444444444444d0) &
+                 + in(i-1,j+4) * (-0.041666666666666664d0) &
+                 + in(i+1,j+4) * (0.001488095238095238d0) &
+                 + in(i+2,j+4) * (0.001488095238095238d0) &
+                 + in(i+3,j+4) * (0.001488095238095238d0) &
+                 + in(i+4,j+4) * (0.010416666666666666d0) &
+                 + in(i+5,j+4) * (0.000925925925925926d0) &
+                 + in(i+6,j+4) * (0.0006313131313131314d0) &
+                 + in(i-6,j+5) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+5) * (-0.000925925925925926d0) &
+                 + in(i-4,j+5) * (-0.001488095238095238d0) &
+                 + in(i-3,j+5) * (-0.002777777777777778d0) &
+                 + in(i-2,j+5) * (-0.006944444444444444d0) &
+                 + in(i-1,j+5) * (-0.041666666666666664d0) &
+                 + in(i+1,j+5) * (0.000925925925925926d0) &
+                 + in(i+2,j+5) * (0.000925925925925926d0) &
+                 + in(i+3,j+5) * (0.000925925925925926d0) &
+                 + in(i+4,j+5) * (0.000925925925925926d0) &
+                 + in(i+5,j+5) * (0.008333333333333333d0) &
+                 + in(i+6,j+5) * (0.0006313131313131314d0) &
+                 + in(i-6,j+6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+6) * (-0.000925925925925926d0) &
+                 + in(i-4,j+6) * (-0.001488095238095238d0) &
+                 + in(i-3,j+6) * (-0.002777777777777778d0) &
+                 + in(i-2,j+6) * (-0.006944444444444444d0) &
+                 + in(i-1,j+6) * (-0.041666666666666664d0) &
+                 + in(i+1,j+6) * (0.0006313131313131314d0) &
+                 + in(i+2,j+6) * (0.0006313131313131314d0) &
+                 + in(i+3,j+6) * (0.0006313131313131314d0) &
+                 + in(i+4,j+6) * (0.0006313131313131314d0) &
+                 + in(i+5,j+6) * (0.0006313131313131314d0) &
+                 + in(i+6,j+6) * (0.006944444444444444d0) &
 +0.0
       end do
     end do
@@ -707,160 +707,160 @@ subroutine grid7(n, in, out)
     do i=7,n-7-1
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i-7,j-7) * (-0.00510204081632653) &
-                 + in(i+1,j-7) * (-0.0003924646781789639) &
-                 + in(i+2,j-7) * (-0.0003924646781789639) &
-                 + in(i+3,j-7) * (-0.0003924646781789639) &
-                 + in(i+4,j-7) * (-0.0003924646781789639) &
-                 + in(i+5,j-7) * (-0.0003924646781789639) &
-                 + in(i+6,j-7) * (-0.0003924646781789639) &
-                 + in(i+7,j-7) * (-0.0003924646781789639) &
-                 + in(i-6,j-6) * (-0.005952380952380952) &
-                 + in(i+1,j-6) * (-0.0005411255411255411) &
-                 + in(i+2,j-6) * (-0.0005411255411255411) &
-                 + in(i+3,j-6) * (-0.0005411255411255411) &
-                 + in(i+4,j-6) * (-0.0005411255411255411) &
-                 + in(i+5,j-6) * (-0.0005411255411255411) &
-                 + in(i+6,j-6) * (-0.0005411255411255411) &
-                 + in(i+7,j-6) * (-0.0005411255411255411) &
-                 + in(i-5,j-5) * (-0.007142857142857143) &
-                 + in(i+1,j-5) * (-0.0007936507936507937) &
-                 + in(i+2,j-5) * (-0.0007936507936507937) &
-                 + in(i+3,j-5) * (-0.0007936507936507937) &
-                 + in(i+4,j-5) * (-0.0007936507936507937) &
-                 + in(i+5,j-5) * (-0.0007936507936507937) &
-                 + in(i+6,j-5) * (-0.0007936507936507937) &
-                 + in(i+7,j-5) * (-0.0007936507936507937) &
-                 + in(i-4,j-4) * (-0.008928571428571428) &
-                 + in(i+1,j-4) * (-0.0012755102040816326) &
-                 + in(i+2,j-4) * (-0.0012755102040816326) &
-                 + in(i+3,j-4) * (-0.0012755102040816326) &
-                 + in(i+4,j-4) * (-0.0012755102040816326) &
-                 + in(i+5,j-4) * (-0.0012755102040816326) &
-                 + in(i+6,j-4) * (-0.0012755102040816326) &
-                 + in(i+7,j-4) * (-0.0012755102040816326) &
-                 + in(i-3,j-3) * (-0.011904761904761904) &
-                 + in(i+1,j-3) * (-0.002380952380952381) &
-                 + in(i+2,j-3) * (-0.002380952380952381) &
-                 + in(i+3,j-3) * (-0.002380952380952381) &
-                 + in(i+4,j-3) * (-0.002380952380952381) &
-                 + in(i+5,j-3) * (-0.002380952380952381) &
-                 + in(i+6,j-3) * (-0.002380952380952381) &
-                 + in(i+7,j-3) * (-0.002380952380952381) &
-                 + in(i-2,j-2) * (-0.017857142857142856) &
-                 + in(i+1,j-2) * (-0.005952380952380952) &
-                 + in(i+2,j-2) * (-0.005952380952380952) &
-                 + in(i+3,j-2) * (-0.005952380952380952) &
-                 + in(i+4,j-2) * (-0.005952380952380952) &
-                 + in(i+5,j-2) * (-0.005952380952380952) &
-                 + in(i+6,j-2) * (-0.005952380952380952) &
-                 + in(i+7,j-2) * (-0.005952380952380952) &
-                 + in(i-1,j-1) * (-0.03571428571428571) &
-                 + in(i+1,j-1) * (-0.03571428571428571) &
-                 + in(i+2,j-1) * (-0.03571428571428571) &
-                 + in(i+3,j-1) * (-0.03571428571428571) &
-                 + in(i+4,j-1) * (-0.03571428571428571) &
-                 + in(i+5,j-1) * (-0.03571428571428571) &
-                 + in(i+6,j-1) * (-0.03571428571428571) &
-                 + in(i+7,j-1) * (-0.03571428571428571) &
-                 + in(i-7,j+1) * (-0.0003924646781789639) &
-                 + in(i-6,j+1) * (-0.0005411255411255411) &
-                 + in(i-5,j+1) * (-0.0007936507936507937) &
-                 + in(i-4,j+1) * (-0.0012755102040816326) &
-                 + in(i-3,j+1) * (-0.002380952380952381) &
-                 + in(i-2,j+1) * (-0.005952380952380952) &
-                 + in(i-1,j+1) * (-0.03571428571428571) &
-                 + in(i+1,j+1) * (0.03571428571428571) &
-                 + in(i+2,j+1) * (0.005952380952380952) &
-                 + in(i+3,j+1) * (0.002380952380952381) &
-                 + in(i+4,j+1) * (0.0012755102040816326) &
-                 + in(i+5,j+1) * (0.0007936507936507937) &
-                 + in(i+6,j+1) * (0.0005411255411255411) &
-                 + in(i+7,j+1) * (0.0003924646781789639) &
-                 + in(i-7,j+2) * (-0.0003924646781789639) &
-                 + in(i-6,j+2) * (-0.0005411255411255411) &
-                 + in(i-5,j+2) * (-0.0007936507936507937) &
-                 + in(i-4,j+2) * (-0.0012755102040816326) &
-                 + in(i-3,j+2) * (-0.002380952380952381) &
-                 + in(i-2,j+2) * (-0.005952380952380952) &
-                 + in(i-1,j+2) * (-0.03571428571428571) &
-                 + in(i+1,j+2) * (0.005952380952380952) &
-                 + in(i+2,j+2) * (0.017857142857142856) &
-                 + in(i+3,j+2) * (0.002380952380952381) &
-                 + in(i+4,j+2) * (0.0012755102040816326) &
-                 + in(i+5,j+2) * (0.0007936507936507937) &
-                 + in(i+6,j+2) * (0.0005411255411255411) &
-                 + in(i+7,j+2) * (0.0003924646781789639) &
-                 + in(i-7,j+3) * (-0.0003924646781789639) &
-                 + in(i-6,j+3) * (-0.0005411255411255411) &
-                 + in(i-5,j+3) * (-0.0007936507936507937) &
-                 + in(i-4,j+3) * (-0.0012755102040816326) &
-                 + in(i-3,j+3) * (-0.002380952380952381) &
-                 + in(i-2,j+3) * (-0.005952380952380952) &
-                 + in(i-1,j+3) * (-0.03571428571428571) &
-                 + in(i+1,j+3) * (0.002380952380952381) &
-                 + in(i+2,j+3) * (0.002380952380952381) &
-                 + in(i+3,j+3) * (0.011904761904761904) &
-                 + in(i+4,j+3) * (0.0012755102040816326) &
-                 + in(i+5,j+3) * (0.0007936507936507937) &
-                 + in(i+6,j+3) * (0.0005411255411255411) &
-                 + in(i+7,j+3) * (0.0003924646781789639) &
-                 + in(i-7,j+4) * (-0.0003924646781789639) &
-                 + in(i-6,j+4) * (-0.0005411255411255411) &
-                 + in(i-5,j+4) * (-0.0007936507936507937) &
-                 + in(i-4,j+4) * (-0.0012755102040816326) &
-                 + in(i-3,j+4) * (-0.002380952380952381) &
-                 + in(i-2,j+4) * (-0.005952380952380952) &
-                 + in(i-1,j+4) * (-0.03571428571428571) &
-                 + in(i+1,j+4) * (0.0012755102040816326) &
-                 + in(i+2,j+4) * (0.0012755102040816326) &
-                 + in(i+3,j+4) * (0.0012755102040816326) &
-                 + in(i+4,j+4) * (0.008928571428571428) &
-                 + in(i+5,j+4) * (0.0007936507936507937) &
-                 + in(i+6,j+4) * (0.0005411255411255411) &
-                 + in(i+7,j+4) * (0.0003924646781789639) &
-                 + in(i-7,j+5) * (-0.0003924646781789639) &
-                 + in(i-6,j+5) * (-0.0005411255411255411) &
-                 + in(i-5,j+5) * (-0.0007936507936507937) &
-                 + in(i-4,j+5) * (-0.0012755102040816326) &
-                 + in(i-3,j+5) * (-0.002380952380952381) &
-                 + in(i-2,j+5) * (-0.005952380952380952) &
-                 + in(i-1,j+5) * (-0.03571428571428571) &
-                 + in(i+1,j+5) * (0.0007936507936507937) &
-                 + in(i+2,j+5) * (0.0007936507936507937) &
-                 + in(i+3,j+5) * (0.0007936507936507937) &
-                 + in(i+4,j+5) * (0.0007936507936507937) &
-                 + in(i+5,j+5) * (0.007142857142857143) &
-                 + in(i+6,j+5) * (0.0005411255411255411) &
-                 + in(i+7,j+5) * (0.0003924646781789639) &
-                 + in(i-7,j+6) * (-0.0003924646781789639) &
-                 + in(i-6,j+6) * (-0.0005411255411255411) &
-                 + in(i-5,j+6) * (-0.0007936507936507937) &
-                 + in(i-4,j+6) * (-0.0012755102040816326) &
-                 + in(i-3,j+6) * (-0.002380952380952381) &
-                 + in(i-2,j+6) * (-0.005952380952380952) &
-                 + in(i-1,j+6) * (-0.03571428571428571) &
-                 + in(i+1,j+6) * (0.0005411255411255411) &
-                 + in(i+2,j+6) * (0.0005411255411255411) &
-                 + in(i+3,j+6) * (0.0005411255411255411) &
-                 + in(i+4,j+6) * (0.0005411255411255411) &
-                 + in(i+5,j+6) * (0.0005411255411255411) &
-                 + in(i+6,j+6) * (0.005952380952380952) &
-                 + in(i+7,j+6) * (0.0003924646781789639) &
-                 + in(i-7,j+7) * (-0.0003924646781789639) &
-                 + in(i-6,j+7) * (-0.0005411255411255411) &
-                 + in(i-5,j+7) * (-0.0007936507936507937) &
-                 + in(i-4,j+7) * (-0.0012755102040816326) &
-                 + in(i-3,j+7) * (-0.002380952380952381) &
-                 + in(i-2,j+7) * (-0.005952380952380952) &
-                 + in(i-1,j+7) * (-0.03571428571428571) &
-                 + in(i+1,j+7) * (0.0003924646781789639) &
-                 + in(i+2,j+7) * (0.0003924646781789639) &
-                 + in(i+3,j+7) * (0.0003924646781789639) &
-                 + in(i+4,j+7) * (0.0003924646781789639) &
-                 + in(i+5,j+7) * (0.0003924646781789639) &
-                 + in(i+6,j+7) * (0.0003924646781789639) &
-                 + in(i+7,j+7) * (0.00510204081632653) &
+                 + in(i-7,j-7) * (-0.00510204081632653d0) &
+                 + in(i+1,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+2,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+3,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+4,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+5,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+6,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+7,j-7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j-6) * (-0.005952380952380952d0) &
+                 + in(i+1,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+2,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+3,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+4,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+5,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+6,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+7,j-6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j-5) * (-0.007142857142857143d0) &
+                 + in(i+1,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+2,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+3,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+4,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+5,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+6,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+7,j-5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j-4) * (-0.008928571428571428d0) &
+                 + in(i+1,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+2,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+3,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+4,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+5,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+6,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+7,j-4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j-3) * (-0.011904761904761904d0) &
+                 + in(i+1,j-3) * (-0.002380952380952381d0) &
+                 + in(i+2,j-3) * (-0.002380952380952381d0) &
+                 + in(i+3,j-3) * (-0.002380952380952381d0) &
+                 + in(i+4,j-3) * (-0.002380952380952381d0) &
+                 + in(i+5,j-3) * (-0.002380952380952381d0) &
+                 + in(i+6,j-3) * (-0.002380952380952381d0) &
+                 + in(i+7,j-3) * (-0.002380952380952381d0) &
+                 + in(i-2,j-2) * (-0.017857142857142856d0) &
+                 + in(i+1,j-2) * (-0.005952380952380952d0) &
+                 + in(i+2,j-2) * (-0.005952380952380952d0) &
+                 + in(i+3,j-2) * (-0.005952380952380952d0) &
+                 + in(i+4,j-2) * (-0.005952380952380952d0) &
+                 + in(i+5,j-2) * (-0.005952380952380952d0) &
+                 + in(i+6,j-2) * (-0.005952380952380952d0) &
+                 + in(i+7,j-2) * (-0.005952380952380952d0) &
+                 + in(i-1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+2,j-1) * (-0.03571428571428571d0) &
+                 + in(i+3,j-1) * (-0.03571428571428571d0) &
+                 + in(i+4,j-1) * (-0.03571428571428571d0) &
+                 + in(i+5,j-1) * (-0.03571428571428571d0) &
+                 + in(i+6,j-1) * (-0.03571428571428571d0) &
+                 + in(i+7,j-1) * (-0.03571428571428571d0) &
+                 + in(i-7,j+1) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+1) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+1) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+1) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+1) * (-0.002380952380952381d0) &
+                 + in(i-2,j+1) * (-0.005952380952380952d0) &
+                 + in(i-1,j+1) * (-0.03571428571428571d0) &
+                 + in(i+1,j+1) * (0.03571428571428571d0) &
+                 + in(i+2,j+1) * (0.005952380952380952d0) &
+                 + in(i+3,j+1) * (0.002380952380952381d0) &
+                 + in(i+4,j+1) * (0.0012755102040816326d0) &
+                 + in(i+5,j+1) * (0.0007936507936507937d0) &
+                 + in(i+6,j+1) * (0.0005411255411255411d0) &
+                 + in(i+7,j+1) * (0.0003924646781789639d0) &
+                 + in(i-7,j+2) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+2) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+2) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+2) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+2) * (-0.002380952380952381d0) &
+                 + in(i-2,j+2) * (-0.005952380952380952d0) &
+                 + in(i-1,j+2) * (-0.03571428571428571d0) &
+                 + in(i+1,j+2) * (0.005952380952380952d0) &
+                 + in(i+2,j+2) * (0.017857142857142856d0) &
+                 + in(i+3,j+2) * (0.002380952380952381d0) &
+                 + in(i+4,j+2) * (0.0012755102040816326d0) &
+                 + in(i+5,j+2) * (0.0007936507936507937d0) &
+                 + in(i+6,j+2) * (0.0005411255411255411d0) &
+                 + in(i+7,j+2) * (0.0003924646781789639d0) &
+                 + in(i-7,j+3) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+3) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+3) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+3) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+3) * (-0.002380952380952381d0) &
+                 + in(i-2,j+3) * (-0.005952380952380952d0) &
+                 + in(i-1,j+3) * (-0.03571428571428571d0) &
+                 + in(i+1,j+3) * (0.002380952380952381d0) &
+                 + in(i+2,j+3) * (0.002380952380952381d0) &
+                 + in(i+3,j+3) * (0.011904761904761904d0) &
+                 + in(i+4,j+3) * (0.0012755102040816326d0) &
+                 + in(i+5,j+3) * (0.0007936507936507937d0) &
+                 + in(i+6,j+3) * (0.0005411255411255411d0) &
+                 + in(i+7,j+3) * (0.0003924646781789639d0) &
+                 + in(i-7,j+4) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+4) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+4) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+4) * (-0.002380952380952381d0) &
+                 + in(i-2,j+4) * (-0.005952380952380952d0) &
+                 + in(i-1,j+4) * (-0.03571428571428571d0) &
+                 + in(i+1,j+4) * (0.0012755102040816326d0) &
+                 + in(i+2,j+4) * (0.0012755102040816326d0) &
+                 + in(i+3,j+4) * (0.0012755102040816326d0) &
+                 + in(i+4,j+4) * (0.008928571428571428d0) &
+                 + in(i+5,j+4) * (0.0007936507936507937d0) &
+                 + in(i+6,j+4) * (0.0005411255411255411d0) &
+                 + in(i+7,j+4) * (0.0003924646781789639d0) &
+                 + in(i-7,j+5) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+5) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+5) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+5) * (-0.002380952380952381d0) &
+                 + in(i-2,j+5) * (-0.005952380952380952d0) &
+                 + in(i-1,j+5) * (-0.03571428571428571d0) &
+                 + in(i+1,j+5) * (0.0007936507936507937d0) &
+                 + in(i+2,j+5) * (0.0007936507936507937d0) &
+                 + in(i+3,j+5) * (0.0007936507936507937d0) &
+                 + in(i+4,j+5) * (0.0007936507936507937d0) &
+                 + in(i+5,j+5) * (0.007142857142857143d0) &
+                 + in(i+6,j+5) * (0.0005411255411255411d0) &
+                 + in(i+7,j+5) * (0.0003924646781789639d0) &
+                 + in(i-7,j+6) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+6) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+6) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+6) * (-0.002380952380952381d0) &
+                 + in(i-2,j+6) * (-0.005952380952380952d0) &
+                 + in(i-1,j+6) * (-0.03571428571428571d0) &
+                 + in(i+1,j+6) * (0.0005411255411255411d0) &
+                 + in(i+2,j+6) * (0.0005411255411255411d0) &
+                 + in(i+3,j+6) * (0.0005411255411255411d0) &
+                 + in(i+4,j+6) * (0.0005411255411255411d0) &
+                 + in(i+5,j+6) * (0.0005411255411255411d0) &
+                 + in(i+6,j+6) * (0.005952380952380952d0) &
+                 + in(i+7,j+6) * (0.0003924646781789639d0) &
+                 + in(i-7,j+7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+7) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+7) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+7) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+7) * (-0.002380952380952381d0) &
+                 + in(i-2,j+7) * (-0.005952380952380952d0) &
+                 + in(i-1,j+7) * (-0.03571428571428571d0) &
+                 + in(i+1,j+7) * (0.0003924646781789639d0) &
+                 + in(i+2,j+7) * (0.0003924646781789639d0) &
+                 + in(i+3,j+7) * (0.0003924646781789639d0) &
+                 + in(i+4,j+7) * (0.0003924646781789639d0) &
+                 + in(i+5,j+7) * (0.0003924646781789639d0) &
+                 + in(i+6,j+7) * (0.0003924646781789639d0) &
+                 + in(i+7,j+7) * (0.00510204081632653d0) &
 +0.0
       end do
     end do
@@ -876,206 +876,206 @@ subroutine grid8(n, in, out)
     do i=8,n-8-1
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i-8,j-8) * (-0.00390625) &
-                 + in(i+1,j-8) * (-0.00026041666666666666) &
-                 + in(i+2,j-8) * (-0.00026041666666666666) &
-                 + in(i+3,j-8) * (-0.00026041666666666666) &
-                 + in(i+4,j-8) * (-0.00026041666666666666) &
-                 + in(i+5,j-8) * (-0.00026041666666666666) &
-                 + in(i+6,j-8) * (-0.00026041666666666666) &
-                 + in(i+7,j-8) * (-0.00026041666666666666) &
-                 + in(i+8,j-8) * (-0.00026041666666666666) &
-                 + in(i-7,j-7) * (-0.004464285714285714) &
-                 + in(i+1,j-7) * (-0.00034340659340659343) &
-                 + in(i+2,j-7) * (-0.00034340659340659343) &
-                 + in(i+3,j-7) * (-0.00034340659340659343) &
-                 + in(i+4,j-7) * (-0.00034340659340659343) &
-                 + in(i+5,j-7) * (-0.00034340659340659343) &
-                 + in(i+6,j-7) * (-0.00034340659340659343) &
-                 + in(i+7,j-7) * (-0.00034340659340659343) &
-                 + in(i+8,j-7) * (-0.00034340659340659343) &
-                 + in(i-6,j-6) * (-0.005208333333333333) &
-                 + in(i+1,j-6) * (-0.0004734848484848485) &
-                 + in(i+2,j-6) * (-0.0004734848484848485) &
-                 + in(i+3,j-6) * (-0.0004734848484848485) &
-                 + in(i+4,j-6) * (-0.0004734848484848485) &
-                 + in(i+5,j-6) * (-0.0004734848484848485) &
-                 + in(i+6,j-6) * (-0.0004734848484848485) &
-                 + in(i+7,j-6) * (-0.0004734848484848485) &
-                 + in(i+8,j-6) * (-0.0004734848484848485) &
-                 + in(i-5,j-5) * (-0.00625) &
-                 + in(i+1,j-5) * (-0.0006944444444444445) &
-                 + in(i+2,j-5) * (-0.0006944444444444445) &
-                 + in(i+3,j-5) * (-0.0006944444444444445) &
-                 + in(i+4,j-5) * (-0.0006944444444444445) &
-                 + in(i+5,j-5) * (-0.0006944444444444445) &
-                 + in(i+6,j-5) * (-0.0006944444444444445) &
-                 + in(i+7,j-5) * (-0.0006944444444444445) &
-                 + in(i+8,j-5) * (-0.0006944444444444445) &
-                 + in(i-4,j-4) * (-0.0078125) &
-                 + in(i+1,j-4) * (-0.0011160714285714285) &
-                 + in(i+2,j-4) * (-0.0011160714285714285) &
-                 + in(i+3,j-4) * (-0.0011160714285714285) &
-                 + in(i+4,j-4) * (-0.0011160714285714285) &
-                 + in(i+5,j-4) * (-0.0011160714285714285) &
-                 + in(i+6,j-4) * (-0.0011160714285714285) &
-                 + in(i+7,j-4) * (-0.0011160714285714285) &
-                 + in(i+8,j-4) * (-0.0011160714285714285) &
-                 + in(i-3,j-3) * (-0.010416666666666666) &
-                 + in(i+1,j-3) * (-0.0020833333333333333) &
-                 + in(i+2,j-3) * (-0.0020833333333333333) &
-                 + in(i+3,j-3) * (-0.0020833333333333333) &
-                 + in(i+4,j-3) * (-0.0020833333333333333) &
-                 + in(i+5,j-3) * (-0.0020833333333333333) &
-                 + in(i+6,j-3) * (-0.0020833333333333333) &
-                 + in(i+7,j-3) * (-0.0020833333333333333) &
-                 + in(i+8,j-3) * (-0.0020833333333333333) &
-                 + in(i-2,j-2) * (-0.015625) &
-                 + in(i+1,j-2) * (-0.005208333333333333) &
-                 + in(i+2,j-2) * (-0.005208333333333333) &
-                 + in(i+3,j-2) * (-0.005208333333333333) &
-                 + in(i+4,j-2) * (-0.005208333333333333) &
-                 + in(i+5,j-2) * (-0.005208333333333333) &
-                 + in(i+6,j-2) * (-0.005208333333333333) &
-                 + in(i+7,j-2) * (-0.005208333333333333) &
-                 + in(i+8,j-2) * (-0.005208333333333333) &
-                 + in(i-1,j-1) * (-0.03125) &
-                 + in(i+1,j-1) * (-0.03125) &
-                 + in(i+2,j-1) * (-0.03125) &
-                 + in(i+3,j-1) * (-0.03125) &
-                 + in(i+4,j-1) * (-0.03125) &
-                 + in(i+5,j-1) * (-0.03125) &
-                 + in(i+6,j-1) * (-0.03125) &
-                 + in(i+7,j-1) * (-0.03125) &
-                 + in(i+8,j-1) * (-0.03125) &
-                 + in(i-8,j+1) * (-0.00026041666666666666) &
-                 + in(i-7,j+1) * (-0.00034340659340659343) &
-                 + in(i-6,j+1) * (-0.0004734848484848485) &
-                 + in(i-5,j+1) * (-0.0006944444444444445) &
-                 + in(i-4,j+1) * (-0.0011160714285714285) &
-                 + in(i-3,j+1) * (-0.0020833333333333333) &
-                 + in(i-2,j+1) * (-0.005208333333333333) &
-                 + in(i-1,j+1) * (-0.03125) &
-                 + in(i+1,j+1) * (0.03125) &
-                 + in(i+2,j+1) * (0.005208333333333333) &
-                 + in(i+3,j+1) * (0.0020833333333333333) &
-                 + in(i+4,j+1) * (0.0011160714285714285) &
-                 + in(i+5,j+1) * (0.0006944444444444445) &
-                 + in(i+6,j+1) * (0.0004734848484848485) &
-                 + in(i+7,j+1) * (0.00034340659340659343) &
-                 + in(i+8,j+1) * (0.00026041666666666666) &
-                 + in(i-8,j+2) * (-0.00026041666666666666) &
-                 + in(i-7,j+2) * (-0.00034340659340659343) &
-                 + in(i-6,j+2) * (-0.0004734848484848485) &
-                 + in(i-5,j+2) * (-0.0006944444444444445) &
-                 + in(i-4,j+2) * (-0.0011160714285714285) &
-                 + in(i-3,j+2) * (-0.0020833333333333333) &
-                 + in(i-2,j+2) * (-0.005208333333333333) &
-                 + in(i-1,j+2) * (-0.03125) &
-                 + in(i+1,j+2) * (0.005208333333333333) &
-                 + in(i+2,j+2) * (0.015625) &
-                 + in(i+3,j+2) * (0.0020833333333333333) &
-                 + in(i+4,j+2) * (0.0011160714285714285) &
-                 + in(i+5,j+2) * (0.0006944444444444445) &
-                 + in(i+6,j+2) * (0.0004734848484848485) &
-                 + in(i+7,j+2) * (0.00034340659340659343) &
-                 + in(i+8,j+2) * (0.00026041666666666666) &
-                 + in(i-8,j+3) * (-0.00026041666666666666) &
-                 + in(i-7,j+3) * (-0.00034340659340659343) &
-                 + in(i-6,j+3) * (-0.0004734848484848485) &
-                 + in(i-5,j+3) * (-0.0006944444444444445) &
-                 + in(i-4,j+3) * (-0.0011160714285714285) &
-                 + in(i-3,j+3) * (-0.0020833333333333333) &
-                 + in(i-2,j+3) * (-0.005208333333333333) &
-                 + in(i-1,j+3) * (-0.03125) &
-                 + in(i+1,j+3) * (0.0020833333333333333) &
-                 + in(i+2,j+3) * (0.0020833333333333333) &
-                 + in(i+3,j+3) * (0.010416666666666666) &
-                 + in(i+4,j+3) * (0.0011160714285714285) &
-                 + in(i+5,j+3) * (0.0006944444444444445) &
-                 + in(i+6,j+3) * (0.0004734848484848485) &
-                 + in(i+7,j+3) * (0.00034340659340659343) &
-                 + in(i+8,j+3) * (0.00026041666666666666) &
-                 + in(i-8,j+4) * (-0.00026041666666666666) &
-                 + in(i-7,j+4) * (-0.00034340659340659343) &
-                 + in(i-6,j+4) * (-0.0004734848484848485) &
-                 + in(i-5,j+4) * (-0.0006944444444444445) &
-                 + in(i-4,j+4) * (-0.0011160714285714285) &
-                 + in(i-3,j+4) * (-0.0020833333333333333) &
-                 + in(i-2,j+4) * (-0.005208333333333333) &
-                 + in(i-1,j+4) * (-0.03125) &
-                 + in(i+1,j+4) * (0.0011160714285714285) &
-                 + in(i+2,j+4) * (0.0011160714285714285) &
-                 + in(i+3,j+4) * (0.0011160714285714285) &
-                 + in(i+4,j+4) * (0.0078125) &
-                 + in(i+5,j+4) * (0.0006944444444444445) &
-                 + in(i+6,j+4) * (0.0004734848484848485) &
-                 + in(i+7,j+4) * (0.00034340659340659343) &
-                 + in(i+8,j+4) * (0.00026041666666666666) &
-                 + in(i-8,j+5) * (-0.00026041666666666666) &
-                 + in(i-7,j+5) * (-0.00034340659340659343) &
-                 + in(i-6,j+5) * (-0.0004734848484848485) &
-                 + in(i-5,j+5) * (-0.0006944444444444445) &
-                 + in(i-4,j+5) * (-0.0011160714285714285) &
-                 + in(i-3,j+5) * (-0.0020833333333333333) &
-                 + in(i-2,j+5) * (-0.005208333333333333) &
-                 + in(i-1,j+5) * (-0.03125) &
-                 + in(i+1,j+5) * (0.0006944444444444445) &
-                 + in(i+2,j+5) * (0.0006944444444444445) &
-                 + in(i+3,j+5) * (0.0006944444444444445) &
-                 + in(i+4,j+5) * (0.0006944444444444445) &
-                 + in(i+5,j+5) * (0.00625) &
-                 + in(i+6,j+5) * (0.0004734848484848485) &
-                 + in(i+7,j+5) * (0.00034340659340659343) &
-                 + in(i+8,j+5) * (0.00026041666666666666) &
-                 + in(i-8,j+6) * (-0.00026041666666666666) &
-                 + in(i-7,j+6) * (-0.00034340659340659343) &
-                 + in(i-6,j+6) * (-0.0004734848484848485) &
-                 + in(i-5,j+6) * (-0.0006944444444444445) &
-                 + in(i-4,j+6) * (-0.0011160714285714285) &
-                 + in(i-3,j+6) * (-0.0020833333333333333) &
-                 + in(i-2,j+6) * (-0.005208333333333333) &
-                 + in(i-1,j+6) * (-0.03125) &
-                 + in(i+1,j+6) * (0.0004734848484848485) &
-                 + in(i+2,j+6) * (0.0004734848484848485) &
-                 + in(i+3,j+6) * (0.0004734848484848485) &
-                 + in(i+4,j+6) * (0.0004734848484848485) &
-                 + in(i+5,j+6) * (0.0004734848484848485) &
-                 + in(i+6,j+6) * (0.005208333333333333) &
-                 + in(i+7,j+6) * (0.00034340659340659343) &
-                 + in(i+8,j+6) * (0.00026041666666666666) &
-                 + in(i-8,j+7) * (-0.00026041666666666666) &
-                 + in(i-7,j+7) * (-0.00034340659340659343) &
-                 + in(i-6,j+7) * (-0.0004734848484848485) &
-                 + in(i-5,j+7) * (-0.0006944444444444445) &
-                 + in(i-4,j+7) * (-0.0011160714285714285) &
-                 + in(i-3,j+7) * (-0.0020833333333333333) &
-                 + in(i-2,j+7) * (-0.005208333333333333) &
-                 + in(i-1,j+7) * (-0.03125) &
-                 + in(i+1,j+7) * (0.00034340659340659343) &
-                 + in(i+2,j+7) * (0.00034340659340659343) &
-                 + in(i+3,j+7) * (0.00034340659340659343) &
-                 + in(i+4,j+7) * (0.00034340659340659343) &
-                 + in(i+5,j+7) * (0.00034340659340659343) &
-                 + in(i+6,j+7) * (0.00034340659340659343) &
-                 + in(i+7,j+7) * (0.004464285714285714) &
-                 + in(i+8,j+7) * (0.00026041666666666666) &
-                 + in(i-8,j+8) * (-0.00026041666666666666) &
-                 + in(i-7,j+8) * (-0.00034340659340659343) &
-                 + in(i-6,j+8) * (-0.0004734848484848485) &
-                 + in(i-5,j+8) * (-0.0006944444444444445) &
-                 + in(i-4,j+8) * (-0.0011160714285714285) &
-                 + in(i-3,j+8) * (-0.0020833333333333333) &
-                 + in(i-2,j+8) * (-0.005208333333333333) &
-                 + in(i-1,j+8) * (-0.03125) &
-                 + in(i+1,j+8) * (0.00026041666666666666) &
-                 + in(i+2,j+8) * (0.00026041666666666666) &
-                 + in(i+3,j+8) * (0.00026041666666666666) &
-                 + in(i+4,j+8) * (0.00026041666666666666) &
-                 + in(i+5,j+8) * (0.00026041666666666666) &
-                 + in(i+6,j+8) * (0.00026041666666666666) &
-                 + in(i+7,j+8) * (0.00026041666666666666) &
-                 + in(i+8,j+8) * (0.00390625) &
+                 + in(i-8,j-8) * (-0.00390625d0) &
+                 + in(i+1,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+2,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+3,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+4,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+5,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+6,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+7,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+8,j-8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j-7) * (-0.004464285714285714d0) &
+                 + in(i+1,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+2,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+3,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+4,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+5,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+6,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+7,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+8,j-7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j-6) * (-0.005208333333333333d0) &
+                 + in(i+1,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+2,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+3,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+4,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+5,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+6,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+7,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+8,j-6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j-5) * (-0.00625d0) &
+                 + in(i+1,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+2,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+3,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+4,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+5,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+6,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+7,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+8,j-5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j-4) * (-0.0078125d0) &
+                 + in(i+1,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+2,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+3,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+4,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+5,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+6,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+7,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+8,j-4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j-3) * (-0.010416666666666666d0) &
+                 + in(i+1,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+2,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+3,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+4,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+5,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+6,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+7,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+8,j-3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j-2) * (-0.015625d0) &
+                 + in(i+1,j-2) * (-0.005208333333333333d0) &
+                 + in(i+2,j-2) * (-0.005208333333333333d0) &
+                 + in(i+3,j-2) * (-0.005208333333333333d0) &
+                 + in(i+4,j-2) * (-0.005208333333333333d0) &
+                 + in(i+5,j-2) * (-0.005208333333333333d0) &
+                 + in(i+6,j-2) * (-0.005208333333333333d0) &
+                 + in(i+7,j-2) * (-0.005208333333333333d0) &
+                 + in(i+8,j-2) * (-0.005208333333333333d0) &
+                 + in(i-1,j-1) * (-0.03125d0) &
+                 + in(i+1,j-1) * (-0.03125d0) &
+                 + in(i+2,j-1) * (-0.03125d0) &
+                 + in(i+3,j-1) * (-0.03125d0) &
+                 + in(i+4,j-1) * (-0.03125d0) &
+                 + in(i+5,j-1) * (-0.03125d0) &
+                 + in(i+6,j-1) * (-0.03125d0) &
+                 + in(i+7,j-1) * (-0.03125d0) &
+                 + in(i+8,j-1) * (-0.03125d0) &
+                 + in(i-8,j+1) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+1) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+1) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+1) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+1) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+1) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+1) * (-0.005208333333333333d0) &
+                 + in(i-1,j+1) * (-0.03125d0) &
+                 + in(i+1,j+1) * (0.03125d0) &
+                 + in(i+2,j+1) * (0.005208333333333333d0) &
+                 + in(i+3,j+1) * (0.0020833333333333333d0) &
+                 + in(i+4,j+1) * (0.0011160714285714285d0) &
+                 + in(i+5,j+1) * (0.0006944444444444445d0) &
+                 + in(i+6,j+1) * (0.0004734848484848485d0) &
+                 + in(i+7,j+1) * (0.00034340659340659343d0) &
+                 + in(i+8,j+1) * (0.00026041666666666666d0) &
+                 + in(i-8,j+2) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+2) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+2) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+2) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+2) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+2) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+2) * (-0.005208333333333333d0) &
+                 + in(i-1,j+2) * (-0.03125d0) &
+                 + in(i+1,j+2) * (0.005208333333333333d0) &
+                 + in(i+2,j+2) * (0.015625d0) &
+                 + in(i+3,j+2) * (0.0020833333333333333d0) &
+                 + in(i+4,j+2) * (0.0011160714285714285d0) &
+                 + in(i+5,j+2) * (0.0006944444444444445d0) &
+                 + in(i+6,j+2) * (0.0004734848484848485d0) &
+                 + in(i+7,j+2) * (0.00034340659340659343d0) &
+                 + in(i+8,j+2) * (0.00026041666666666666d0) &
+                 + in(i-8,j+3) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+3) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+3) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+3) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+3) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+3) * (-0.005208333333333333d0) &
+                 + in(i-1,j+3) * (-0.03125d0) &
+                 + in(i+1,j+3) * (0.0020833333333333333d0) &
+                 + in(i+2,j+3) * (0.0020833333333333333d0) &
+                 + in(i+3,j+3) * (0.010416666666666666d0) &
+                 + in(i+4,j+3) * (0.0011160714285714285d0) &
+                 + in(i+5,j+3) * (0.0006944444444444445d0) &
+                 + in(i+6,j+3) * (0.0004734848484848485d0) &
+                 + in(i+7,j+3) * (0.00034340659340659343d0) &
+                 + in(i+8,j+3) * (0.00026041666666666666d0) &
+                 + in(i-8,j+4) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+4) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+4) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+4) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+4) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+4) * (-0.005208333333333333d0) &
+                 + in(i-1,j+4) * (-0.03125d0) &
+                 + in(i+1,j+4) * (0.0011160714285714285d0) &
+                 + in(i+2,j+4) * (0.0011160714285714285d0) &
+                 + in(i+3,j+4) * (0.0011160714285714285d0) &
+                 + in(i+4,j+4) * (0.0078125d0) &
+                 + in(i+5,j+4) * (0.0006944444444444445d0) &
+                 + in(i+6,j+4) * (0.0004734848484848485d0) &
+                 + in(i+7,j+4) * (0.00034340659340659343d0) &
+                 + in(i+8,j+4) * (0.00026041666666666666d0) &
+                 + in(i-8,j+5) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+5) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+5) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+5) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+5) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+5) * (-0.005208333333333333d0) &
+                 + in(i-1,j+5) * (-0.03125d0) &
+                 + in(i+1,j+5) * (0.0006944444444444445d0) &
+                 + in(i+2,j+5) * (0.0006944444444444445d0) &
+                 + in(i+3,j+5) * (0.0006944444444444445d0) &
+                 + in(i+4,j+5) * (0.0006944444444444445d0) &
+                 + in(i+5,j+5) * (0.00625d0) &
+                 + in(i+6,j+5) * (0.0004734848484848485d0) &
+                 + in(i+7,j+5) * (0.00034340659340659343d0) &
+                 + in(i+8,j+5) * (0.00026041666666666666d0) &
+                 + in(i-8,j+6) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+6) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+6) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+6) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+6) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+6) * (-0.005208333333333333d0) &
+                 + in(i-1,j+6) * (-0.03125d0) &
+                 + in(i+1,j+6) * (0.0004734848484848485d0) &
+                 + in(i+2,j+6) * (0.0004734848484848485d0) &
+                 + in(i+3,j+6) * (0.0004734848484848485d0) &
+                 + in(i+4,j+6) * (0.0004734848484848485d0) &
+                 + in(i+5,j+6) * (0.0004734848484848485d0) &
+                 + in(i+6,j+6) * (0.005208333333333333d0) &
+                 + in(i+7,j+6) * (0.00034340659340659343d0) &
+                 + in(i+8,j+6) * (0.00026041666666666666d0) &
+                 + in(i-8,j+7) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+7) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+7) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+7) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+7) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+7) * (-0.005208333333333333d0) &
+                 + in(i-1,j+7) * (-0.03125d0) &
+                 + in(i+1,j+7) * (0.00034340659340659343d0) &
+                 + in(i+2,j+7) * (0.00034340659340659343d0) &
+                 + in(i+3,j+7) * (0.00034340659340659343d0) &
+                 + in(i+4,j+7) * (0.00034340659340659343d0) &
+                 + in(i+5,j+7) * (0.00034340659340659343d0) &
+                 + in(i+6,j+7) * (0.00034340659340659343d0) &
+                 + in(i+7,j+7) * (0.004464285714285714d0) &
+                 + in(i+8,j+7) * (0.00026041666666666666d0) &
+                 + in(i-8,j+8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+8) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+8) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+8) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+8) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+8) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+8) * (-0.005208333333333333d0) &
+                 + in(i-1,j+8) * (-0.03125d0) &
+                 + in(i+1,j+8) * (0.00026041666666666666d0) &
+                 + in(i+2,j+8) * (0.00026041666666666666d0) &
+                 + in(i+3,j+8) * (0.00026041666666666666d0) &
+                 + in(i+4,j+8) * (0.00026041666666666666d0) &
+                 + in(i+5,j+8) * (0.00026041666666666666d0) &
+                 + in(i+6,j+8) * (0.00026041666666666666d0) &
+                 + in(i+7,j+8) * (0.00026041666666666666d0) &
+                 + in(i+8,j+8) * (0.00390625d0) &
 +0.0
       end do
     end do
@@ -1091,258 +1091,258 @@ subroutine grid9(n, in, out)
     do i=9,n-9-1
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i-9,j-9) * (-0.0030864197530864196) &
-                 + in(i+1,j-9) * (-0.00018155410312273057) &
-                 + in(i+2,j-9) * (-0.00018155410312273057) &
-                 + in(i+3,j-9) * (-0.00018155410312273057) &
-                 + in(i+4,j-9) * (-0.00018155410312273057) &
-                 + in(i+5,j-9) * (-0.00018155410312273057) &
-                 + in(i+6,j-9) * (-0.00018155410312273057) &
-                 + in(i+7,j-9) * (-0.00018155410312273057) &
-                 + in(i+8,j-9) * (-0.00018155410312273057) &
-                 + in(i+9,j-9) * (-0.00018155410312273057) &
-                 + in(i-8,j-8) * (-0.003472222222222222) &
-                 + in(i+1,j-8) * (-0.0002314814814814815) &
-                 + in(i+2,j-8) * (-0.0002314814814814815) &
-                 + in(i+3,j-8) * (-0.0002314814814814815) &
-                 + in(i+4,j-8) * (-0.0002314814814814815) &
-                 + in(i+5,j-8) * (-0.0002314814814814815) &
-                 + in(i+6,j-8) * (-0.0002314814814814815) &
-                 + in(i+7,j-8) * (-0.0002314814814814815) &
-                 + in(i+8,j-8) * (-0.0002314814814814815) &
-                 + in(i+9,j-8) * (-0.0002314814814814815) &
-                 + in(i-7,j-7) * (-0.003968253968253968) &
-                 + in(i+1,j-7) * (-0.00030525030525030525) &
-                 + in(i+2,j-7) * (-0.00030525030525030525) &
-                 + in(i+3,j-7) * (-0.00030525030525030525) &
-                 + in(i+4,j-7) * (-0.00030525030525030525) &
-                 + in(i+5,j-7) * (-0.00030525030525030525) &
-                 + in(i+6,j-7) * (-0.00030525030525030525) &
-                 + in(i+7,j-7) * (-0.00030525030525030525) &
-                 + in(i+8,j-7) * (-0.00030525030525030525) &
-                 + in(i+9,j-7) * (-0.00030525030525030525) &
-                 + in(i-6,j-6) * (-0.004629629629629629) &
-                 + in(i+1,j-6) * (-0.00042087542087542086) &
-                 + in(i+2,j-6) * (-0.00042087542087542086) &
-                 + in(i+3,j-6) * (-0.00042087542087542086) &
-                 + in(i+4,j-6) * (-0.00042087542087542086) &
-                 + in(i+5,j-6) * (-0.00042087542087542086) &
-                 + in(i+6,j-6) * (-0.00042087542087542086) &
-                 + in(i+7,j-6) * (-0.00042087542087542086) &
-                 + in(i+8,j-6) * (-0.00042087542087542086) &
-                 + in(i+9,j-6) * (-0.00042087542087542086) &
-                 + in(i-5,j-5) * (-0.005555555555555556) &
-                 + in(i+1,j-5) * (-0.0006172839506172839) &
-                 + in(i+2,j-5) * (-0.0006172839506172839) &
-                 + in(i+3,j-5) * (-0.0006172839506172839) &
-                 + in(i+4,j-5) * (-0.0006172839506172839) &
-                 + in(i+5,j-5) * (-0.0006172839506172839) &
-                 + in(i+6,j-5) * (-0.0006172839506172839) &
-                 + in(i+7,j-5) * (-0.0006172839506172839) &
-                 + in(i+8,j-5) * (-0.0006172839506172839) &
-                 + in(i+9,j-5) * (-0.0006172839506172839) &
-                 + in(i-4,j-4) * (-0.006944444444444444) &
-                 + in(i+1,j-4) * (-0.000992063492063492) &
-                 + in(i+2,j-4) * (-0.000992063492063492) &
-                 + in(i+3,j-4) * (-0.000992063492063492) &
-                 + in(i+4,j-4) * (-0.000992063492063492) &
-                 + in(i+5,j-4) * (-0.000992063492063492) &
-                 + in(i+6,j-4) * (-0.000992063492063492) &
-                 + in(i+7,j-4) * (-0.000992063492063492) &
-                 + in(i+8,j-4) * (-0.000992063492063492) &
-                 + in(i+9,j-4) * (-0.000992063492063492) &
-                 + in(i-3,j-3) * (-0.009259259259259259) &
-                 + in(i+1,j-3) * (-0.001851851851851852) &
-                 + in(i+2,j-3) * (-0.001851851851851852) &
-                 + in(i+3,j-3) * (-0.001851851851851852) &
-                 + in(i+4,j-3) * (-0.001851851851851852) &
-                 + in(i+5,j-3) * (-0.001851851851851852) &
-                 + in(i+6,j-3) * (-0.001851851851851852) &
-                 + in(i+7,j-3) * (-0.001851851851851852) &
-                 + in(i+8,j-3) * (-0.001851851851851852) &
-                 + in(i+9,j-3) * (-0.001851851851851852) &
-                 + in(i-2,j-2) * (-0.013888888888888888) &
-                 + in(i+1,j-2) * (-0.004629629629629629) &
-                 + in(i+2,j-2) * (-0.004629629629629629) &
-                 + in(i+3,j-2) * (-0.004629629629629629) &
-                 + in(i+4,j-2) * (-0.004629629629629629) &
-                 + in(i+5,j-2) * (-0.004629629629629629) &
-                 + in(i+6,j-2) * (-0.004629629629629629) &
-                 + in(i+7,j-2) * (-0.004629629629629629) &
-                 + in(i+8,j-2) * (-0.004629629629629629) &
-                 + in(i+9,j-2) * (-0.004629629629629629) &
-                 + in(i-1,j-1) * (-0.027777777777777776) &
-                 + in(i+1,j-1) * (-0.027777777777777776) &
-                 + in(i+2,j-1) * (-0.027777777777777776) &
-                 + in(i+3,j-1) * (-0.027777777777777776) &
-                 + in(i+4,j-1) * (-0.027777777777777776) &
-                 + in(i+5,j-1) * (-0.027777777777777776) &
-                 + in(i+6,j-1) * (-0.027777777777777776) &
-                 + in(i+7,j-1) * (-0.027777777777777776) &
-                 + in(i+8,j-1) * (-0.027777777777777776) &
-                 + in(i+9,j-1) * (-0.027777777777777776) &
-                 + in(i-9,j+1) * (-0.00018155410312273057) &
-                 + in(i-8,j+1) * (-0.0002314814814814815) &
-                 + in(i-7,j+1) * (-0.00030525030525030525) &
-                 + in(i-6,j+1) * (-0.00042087542087542086) &
-                 + in(i-5,j+1) * (-0.0006172839506172839) &
-                 + in(i-4,j+1) * (-0.000992063492063492) &
-                 + in(i-3,j+1) * (-0.001851851851851852) &
-                 + in(i-2,j+1) * (-0.004629629629629629) &
-                 + in(i-1,j+1) * (-0.027777777777777776) &
-                 + in(i+1,j+1) * (0.027777777777777776) &
-                 + in(i+2,j+1) * (0.004629629629629629) &
-                 + in(i+3,j+1) * (0.001851851851851852) &
-                 + in(i+4,j+1) * (0.000992063492063492) &
-                 + in(i+5,j+1) * (0.0006172839506172839) &
-                 + in(i+6,j+1) * (0.00042087542087542086) &
-                 + in(i+7,j+1) * (0.00030525030525030525) &
-                 + in(i+8,j+1) * (0.0002314814814814815) &
-                 + in(i+9,j+1) * (0.00018155410312273057) &
-                 + in(i-9,j+2) * (-0.00018155410312273057) &
-                 + in(i-8,j+2) * (-0.0002314814814814815) &
-                 + in(i-7,j+2) * (-0.00030525030525030525) &
-                 + in(i-6,j+2) * (-0.00042087542087542086) &
-                 + in(i-5,j+2) * (-0.0006172839506172839) &
-                 + in(i-4,j+2) * (-0.000992063492063492) &
-                 + in(i-3,j+2) * (-0.001851851851851852) &
-                 + in(i-2,j+2) * (-0.004629629629629629) &
-                 + in(i-1,j+2) * (-0.027777777777777776) &
-                 + in(i+1,j+2) * (0.004629629629629629) &
-                 + in(i+2,j+2) * (0.013888888888888888) &
-                 + in(i+3,j+2) * (0.001851851851851852) &
-                 + in(i+4,j+2) * (0.000992063492063492) &
-                 + in(i+5,j+2) * (0.0006172839506172839) &
-                 + in(i+6,j+2) * (0.00042087542087542086) &
-                 + in(i+7,j+2) * (0.00030525030525030525) &
-                 + in(i+8,j+2) * (0.0002314814814814815) &
-                 + in(i+9,j+2) * (0.00018155410312273057) &
-                 + in(i-9,j+3) * (-0.00018155410312273057) &
-                 + in(i-8,j+3) * (-0.0002314814814814815) &
-                 + in(i-7,j+3) * (-0.00030525030525030525) &
-                 + in(i-6,j+3) * (-0.00042087542087542086) &
-                 + in(i-5,j+3) * (-0.0006172839506172839) &
-                 + in(i-4,j+3) * (-0.000992063492063492) &
-                 + in(i-3,j+3) * (-0.001851851851851852) &
-                 + in(i-2,j+3) * (-0.004629629629629629) &
-                 + in(i-1,j+3) * (-0.027777777777777776) &
-                 + in(i+1,j+3) * (0.001851851851851852) &
-                 + in(i+2,j+3) * (0.001851851851851852) &
-                 + in(i+3,j+3) * (0.009259259259259259) &
-                 + in(i+4,j+3) * (0.000992063492063492) &
-                 + in(i+5,j+3) * (0.0006172839506172839) &
-                 + in(i+6,j+3) * (0.00042087542087542086) &
-                 + in(i+7,j+3) * (0.00030525030525030525) &
-                 + in(i+8,j+3) * (0.0002314814814814815) &
-                 + in(i+9,j+3) * (0.00018155410312273057) &
-                 + in(i-9,j+4) * (-0.00018155410312273057) &
-                 + in(i-8,j+4) * (-0.0002314814814814815) &
-                 + in(i-7,j+4) * (-0.00030525030525030525) &
-                 + in(i-6,j+4) * (-0.00042087542087542086) &
-                 + in(i-5,j+4) * (-0.0006172839506172839) &
-                 + in(i-4,j+4) * (-0.000992063492063492) &
-                 + in(i-3,j+4) * (-0.001851851851851852) &
-                 + in(i-2,j+4) * (-0.004629629629629629) &
-                 + in(i-1,j+4) * (-0.027777777777777776) &
-                 + in(i+1,j+4) * (0.000992063492063492) &
-                 + in(i+2,j+4) * (0.000992063492063492) &
-                 + in(i+3,j+4) * (0.000992063492063492) &
-                 + in(i+4,j+4) * (0.006944444444444444) &
-                 + in(i+5,j+4) * (0.0006172839506172839) &
-                 + in(i+6,j+4) * (0.00042087542087542086) &
-                 + in(i+7,j+4) * (0.00030525030525030525) &
-                 + in(i+8,j+4) * (0.0002314814814814815) &
-                 + in(i+9,j+4) * (0.00018155410312273057) &
-                 + in(i-9,j+5) * (-0.00018155410312273057) &
-                 + in(i-8,j+5) * (-0.0002314814814814815) &
-                 + in(i-7,j+5) * (-0.00030525030525030525) &
-                 + in(i-6,j+5) * (-0.00042087542087542086) &
-                 + in(i-5,j+5) * (-0.0006172839506172839) &
-                 + in(i-4,j+5) * (-0.000992063492063492) &
-                 + in(i-3,j+5) * (-0.001851851851851852) &
-                 + in(i-2,j+5) * (-0.004629629629629629) &
-                 + in(i-1,j+5) * (-0.027777777777777776) &
-                 + in(i+1,j+5) * (0.0006172839506172839) &
-                 + in(i+2,j+5) * (0.0006172839506172839) &
-                 + in(i+3,j+5) * (0.0006172839506172839) &
-                 + in(i+4,j+5) * (0.0006172839506172839) &
-                 + in(i+5,j+5) * (0.005555555555555556) &
-                 + in(i+6,j+5) * (0.00042087542087542086) &
-                 + in(i+7,j+5) * (0.00030525030525030525) &
-                 + in(i+8,j+5) * (0.0002314814814814815) &
-                 + in(i+9,j+5) * (0.00018155410312273057) &
-                 + in(i-9,j+6) * (-0.00018155410312273057) &
-                 + in(i-8,j+6) * (-0.0002314814814814815) &
-                 + in(i-7,j+6) * (-0.00030525030525030525) &
-                 + in(i-6,j+6) * (-0.00042087542087542086) &
-                 + in(i-5,j+6) * (-0.0006172839506172839) &
-                 + in(i-4,j+6) * (-0.000992063492063492) &
-                 + in(i-3,j+6) * (-0.001851851851851852) &
-                 + in(i-2,j+6) * (-0.004629629629629629) &
-                 + in(i-1,j+6) * (-0.027777777777777776) &
-                 + in(i+1,j+6) * (0.00042087542087542086) &
-                 + in(i+2,j+6) * (0.00042087542087542086) &
-                 + in(i+3,j+6) * (0.00042087542087542086) &
-                 + in(i+4,j+6) * (0.00042087542087542086) &
-                 + in(i+5,j+6) * (0.00042087542087542086) &
-                 + in(i+6,j+6) * (0.004629629629629629) &
-                 + in(i+7,j+6) * (0.00030525030525030525) &
-                 + in(i+8,j+6) * (0.0002314814814814815) &
-                 + in(i+9,j+6) * (0.00018155410312273057) &
-                 + in(i-9,j+7) * (-0.00018155410312273057) &
-                 + in(i-8,j+7) * (-0.0002314814814814815) &
-                 + in(i-7,j+7) * (-0.00030525030525030525) &
-                 + in(i-6,j+7) * (-0.00042087542087542086) &
-                 + in(i-5,j+7) * (-0.0006172839506172839) &
-                 + in(i-4,j+7) * (-0.000992063492063492) &
-                 + in(i-3,j+7) * (-0.001851851851851852) &
-                 + in(i-2,j+7) * (-0.004629629629629629) &
-                 + in(i-1,j+7) * (-0.027777777777777776) &
-                 + in(i+1,j+7) * (0.00030525030525030525) &
-                 + in(i+2,j+7) * (0.00030525030525030525) &
-                 + in(i+3,j+7) * (0.00030525030525030525) &
-                 + in(i+4,j+7) * (0.00030525030525030525) &
-                 + in(i+5,j+7) * (0.00030525030525030525) &
-                 + in(i+6,j+7) * (0.00030525030525030525) &
-                 + in(i+7,j+7) * (0.003968253968253968) &
-                 + in(i+8,j+7) * (0.0002314814814814815) &
-                 + in(i+9,j+7) * (0.00018155410312273057) &
-                 + in(i-9,j+8) * (-0.00018155410312273057) &
-                 + in(i-8,j+8) * (-0.0002314814814814815) &
-                 + in(i-7,j+8) * (-0.00030525030525030525) &
-                 + in(i-6,j+8) * (-0.00042087542087542086) &
-                 + in(i-5,j+8) * (-0.0006172839506172839) &
-                 + in(i-4,j+8) * (-0.000992063492063492) &
-                 + in(i-3,j+8) * (-0.001851851851851852) &
-                 + in(i-2,j+8) * (-0.004629629629629629) &
-                 + in(i-1,j+8) * (-0.027777777777777776) &
-                 + in(i+1,j+8) * (0.0002314814814814815) &
-                 + in(i+2,j+8) * (0.0002314814814814815) &
-                 + in(i+3,j+8) * (0.0002314814814814815) &
-                 + in(i+4,j+8) * (0.0002314814814814815) &
-                 + in(i+5,j+8) * (0.0002314814814814815) &
-                 + in(i+6,j+8) * (0.0002314814814814815) &
-                 + in(i+7,j+8) * (0.0002314814814814815) &
-                 + in(i+8,j+8) * (0.003472222222222222) &
-                 + in(i+9,j+8) * (0.00018155410312273057) &
-                 + in(i-9,j+9) * (-0.00018155410312273057) &
-                 + in(i-8,j+9) * (-0.0002314814814814815) &
-                 + in(i-7,j+9) * (-0.00030525030525030525) &
-                 + in(i-6,j+9) * (-0.00042087542087542086) &
-                 + in(i-5,j+9) * (-0.0006172839506172839) &
-                 + in(i-4,j+9) * (-0.000992063492063492) &
-                 + in(i-3,j+9) * (-0.001851851851851852) &
-                 + in(i-2,j+9) * (-0.004629629629629629) &
-                 + in(i-1,j+9) * (-0.027777777777777776) &
-                 + in(i+1,j+9) * (0.00018155410312273057) &
-                 + in(i+2,j+9) * (0.00018155410312273057) &
-                 + in(i+3,j+9) * (0.00018155410312273057) &
-                 + in(i+4,j+9) * (0.00018155410312273057) &
-                 + in(i+5,j+9) * (0.00018155410312273057) &
-                 + in(i+6,j+9) * (0.00018155410312273057) &
-                 + in(i+7,j+9) * (0.00018155410312273057) &
-                 + in(i+8,j+9) * (0.00018155410312273057) &
-                 + in(i+9,j+9) * (0.0030864197530864196) &
+                 + in(i-9,j-9) * (-0.0030864197530864196d0) &
+                 + in(i+1,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+2,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+3,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+4,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+5,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+6,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+7,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+8,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+9,j-9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j-8) * (-0.003472222222222222d0) &
+                 + in(i+1,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+2,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+3,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+4,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+5,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+6,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+7,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+8,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+9,j-8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j-7) * (-0.003968253968253968d0) &
+                 + in(i+1,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+2,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+3,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+4,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+5,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+6,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+7,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+8,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+9,j-7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j-6) * (-0.004629629629629629d0) &
+                 + in(i+1,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+2,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+3,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+4,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+5,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+6,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+7,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+8,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+9,j-6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j-5) * (-0.005555555555555556d0) &
+                 + in(i+1,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+2,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+3,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+4,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+5,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+6,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+7,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+8,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+9,j-5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j-4) * (-0.006944444444444444d0) &
+                 + in(i+1,j-4) * (-0.000992063492063492d0) &
+                 + in(i+2,j-4) * (-0.000992063492063492d0) &
+                 + in(i+3,j-4) * (-0.000992063492063492d0) &
+                 + in(i+4,j-4) * (-0.000992063492063492d0) &
+                 + in(i+5,j-4) * (-0.000992063492063492d0) &
+                 + in(i+6,j-4) * (-0.000992063492063492d0) &
+                 + in(i+7,j-4) * (-0.000992063492063492d0) &
+                 + in(i+8,j-4) * (-0.000992063492063492d0) &
+                 + in(i+9,j-4) * (-0.000992063492063492d0) &
+                 + in(i-3,j-3) * (-0.009259259259259259d0) &
+                 + in(i+1,j-3) * (-0.001851851851851852d0) &
+                 + in(i+2,j-3) * (-0.001851851851851852d0) &
+                 + in(i+3,j-3) * (-0.001851851851851852d0) &
+                 + in(i+4,j-3) * (-0.001851851851851852d0) &
+                 + in(i+5,j-3) * (-0.001851851851851852d0) &
+                 + in(i+6,j-3) * (-0.001851851851851852d0) &
+                 + in(i+7,j-3) * (-0.001851851851851852d0) &
+                 + in(i+8,j-3) * (-0.001851851851851852d0) &
+                 + in(i+9,j-3) * (-0.001851851851851852d0) &
+                 + in(i-2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+1,j-2) * (-0.004629629629629629d0) &
+                 + in(i+2,j-2) * (-0.004629629629629629d0) &
+                 + in(i+3,j-2) * (-0.004629629629629629d0) &
+                 + in(i+4,j-2) * (-0.004629629629629629d0) &
+                 + in(i+5,j-2) * (-0.004629629629629629d0) &
+                 + in(i+6,j-2) * (-0.004629629629629629d0) &
+                 + in(i+7,j-2) * (-0.004629629629629629d0) &
+                 + in(i+8,j-2) * (-0.004629629629629629d0) &
+                 + in(i+9,j-2) * (-0.004629629629629629d0) &
+                 + in(i-1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+2,j-1) * (-0.027777777777777776d0) &
+                 + in(i+3,j-1) * (-0.027777777777777776d0) &
+                 + in(i+4,j-1) * (-0.027777777777777776d0) &
+                 + in(i+5,j-1) * (-0.027777777777777776d0) &
+                 + in(i+6,j-1) * (-0.027777777777777776d0) &
+                 + in(i+7,j-1) * (-0.027777777777777776d0) &
+                 + in(i+8,j-1) * (-0.027777777777777776d0) &
+                 + in(i+9,j-1) * (-0.027777777777777776d0) &
+                 + in(i-9,j+1) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+1) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+1) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+1) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+1) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+1) * (-0.000992063492063492d0) &
+                 + in(i-3,j+1) * (-0.001851851851851852d0) &
+                 + in(i-2,j+1) * (-0.004629629629629629d0) &
+                 + in(i-1,j+1) * (-0.027777777777777776d0) &
+                 + in(i+1,j+1) * (0.027777777777777776d0) &
+                 + in(i+2,j+1) * (0.004629629629629629d0) &
+                 + in(i+3,j+1) * (0.001851851851851852d0) &
+                 + in(i+4,j+1) * (0.000992063492063492d0) &
+                 + in(i+5,j+1) * (0.0006172839506172839d0) &
+                 + in(i+6,j+1) * (0.00042087542087542086d0) &
+                 + in(i+7,j+1) * (0.00030525030525030525d0) &
+                 + in(i+8,j+1) * (0.0002314814814814815d0) &
+                 + in(i+9,j+1) * (0.00018155410312273057d0) &
+                 + in(i-9,j+2) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+2) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+2) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+2) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+2) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+2) * (-0.000992063492063492d0) &
+                 + in(i-3,j+2) * (-0.001851851851851852d0) &
+                 + in(i-2,j+2) * (-0.004629629629629629d0) &
+                 + in(i-1,j+2) * (-0.027777777777777776d0) &
+                 + in(i+1,j+2) * (0.004629629629629629d0) &
+                 + in(i+2,j+2) * (0.013888888888888888d0) &
+                 + in(i+3,j+2) * (0.001851851851851852d0) &
+                 + in(i+4,j+2) * (0.000992063492063492d0) &
+                 + in(i+5,j+2) * (0.0006172839506172839d0) &
+                 + in(i+6,j+2) * (0.00042087542087542086d0) &
+                 + in(i+7,j+2) * (0.00030525030525030525d0) &
+                 + in(i+8,j+2) * (0.0002314814814814815d0) &
+                 + in(i+9,j+2) * (0.00018155410312273057d0) &
+                 + in(i-9,j+3) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+3) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+3) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+3) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+3) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+3) * (-0.000992063492063492d0) &
+                 + in(i-3,j+3) * (-0.001851851851851852d0) &
+                 + in(i-2,j+3) * (-0.004629629629629629d0) &
+                 + in(i-1,j+3) * (-0.027777777777777776d0) &
+                 + in(i+1,j+3) * (0.001851851851851852d0) &
+                 + in(i+2,j+3) * (0.001851851851851852d0) &
+                 + in(i+3,j+3) * (0.009259259259259259d0) &
+                 + in(i+4,j+3) * (0.000992063492063492d0) &
+                 + in(i+5,j+3) * (0.0006172839506172839d0) &
+                 + in(i+6,j+3) * (0.00042087542087542086d0) &
+                 + in(i+7,j+3) * (0.00030525030525030525d0) &
+                 + in(i+8,j+3) * (0.0002314814814814815d0) &
+                 + in(i+9,j+3) * (0.00018155410312273057d0) &
+                 + in(i-9,j+4) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+4) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+4) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+4) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+4) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+4) * (-0.000992063492063492d0) &
+                 + in(i-3,j+4) * (-0.001851851851851852d0) &
+                 + in(i-2,j+4) * (-0.004629629629629629d0) &
+                 + in(i-1,j+4) * (-0.027777777777777776d0) &
+                 + in(i+1,j+4) * (0.000992063492063492d0) &
+                 + in(i+2,j+4) * (0.000992063492063492d0) &
+                 + in(i+3,j+4) * (0.000992063492063492d0) &
+                 + in(i+4,j+4) * (0.006944444444444444d0) &
+                 + in(i+5,j+4) * (0.0006172839506172839d0) &
+                 + in(i+6,j+4) * (0.00042087542087542086d0) &
+                 + in(i+7,j+4) * (0.00030525030525030525d0) &
+                 + in(i+8,j+4) * (0.0002314814814814815d0) &
+                 + in(i+9,j+4) * (0.00018155410312273057d0) &
+                 + in(i-9,j+5) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+5) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+5) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+5) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+5) * (-0.000992063492063492d0) &
+                 + in(i-3,j+5) * (-0.001851851851851852d0) &
+                 + in(i-2,j+5) * (-0.004629629629629629d0) &
+                 + in(i-1,j+5) * (-0.027777777777777776d0) &
+                 + in(i+1,j+5) * (0.0006172839506172839d0) &
+                 + in(i+2,j+5) * (0.0006172839506172839d0) &
+                 + in(i+3,j+5) * (0.0006172839506172839d0) &
+                 + in(i+4,j+5) * (0.0006172839506172839d0) &
+                 + in(i+5,j+5) * (0.005555555555555556d0) &
+                 + in(i+6,j+5) * (0.00042087542087542086d0) &
+                 + in(i+7,j+5) * (0.00030525030525030525d0) &
+                 + in(i+8,j+5) * (0.0002314814814814815d0) &
+                 + in(i+9,j+5) * (0.00018155410312273057d0) &
+                 + in(i-9,j+6) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+6) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+6) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+6) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+6) * (-0.000992063492063492d0) &
+                 + in(i-3,j+6) * (-0.001851851851851852d0) &
+                 + in(i-2,j+6) * (-0.004629629629629629d0) &
+                 + in(i-1,j+6) * (-0.027777777777777776d0) &
+                 + in(i+1,j+6) * (0.00042087542087542086d0) &
+                 + in(i+2,j+6) * (0.00042087542087542086d0) &
+                 + in(i+3,j+6) * (0.00042087542087542086d0) &
+                 + in(i+4,j+6) * (0.00042087542087542086d0) &
+                 + in(i+5,j+6) * (0.00042087542087542086d0) &
+                 + in(i+6,j+6) * (0.004629629629629629d0) &
+                 + in(i+7,j+6) * (0.00030525030525030525d0) &
+                 + in(i+8,j+6) * (0.0002314814814814815d0) &
+                 + in(i+9,j+6) * (0.00018155410312273057d0) &
+                 + in(i-9,j+7) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+7) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+7) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+7) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+7) * (-0.000992063492063492d0) &
+                 + in(i-3,j+7) * (-0.001851851851851852d0) &
+                 + in(i-2,j+7) * (-0.004629629629629629d0) &
+                 + in(i-1,j+7) * (-0.027777777777777776d0) &
+                 + in(i+1,j+7) * (0.00030525030525030525d0) &
+                 + in(i+2,j+7) * (0.00030525030525030525d0) &
+                 + in(i+3,j+7) * (0.00030525030525030525d0) &
+                 + in(i+4,j+7) * (0.00030525030525030525d0) &
+                 + in(i+5,j+7) * (0.00030525030525030525d0) &
+                 + in(i+6,j+7) * (0.00030525030525030525d0) &
+                 + in(i+7,j+7) * (0.003968253968253968d0) &
+                 + in(i+8,j+7) * (0.0002314814814814815d0) &
+                 + in(i+9,j+7) * (0.00018155410312273057d0) &
+                 + in(i-9,j+8) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+8) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+8) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+8) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+8) * (-0.000992063492063492d0) &
+                 + in(i-3,j+8) * (-0.001851851851851852d0) &
+                 + in(i-2,j+8) * (-0.004629629629629629d0) &
+                 + in(i-1,j+8) * (-0.027777777777777776d0) &
+                 + in(i+1,j+8) * (0.0002314814814814815d0) &
+                 + in(i+2,j+8) * (0.0002314814814814815d0) &
+                 + in(i+3,j+8) * (0.0002314814814814815d0) &
+                 + in(i+4,j+8) * (0.0002314814814814815d0) &
+                 + in(i+5,j+8) * (0.0002314814814814815d0) &
+                 + in(i+6,j+8) * (0.0002314814814814815d0) &
+                 + in(i+7,j+8) * (0.0002314814814814815d0) &
+                 + in(i+8,j+8) * (0.003472222222222222d0) &
+                 + in(i+9,j+8) * (0.00018155410312273057d0) &
+                 + in(i-9,j+9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+9) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+9) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+9) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+9) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+9) * (-0.000992063492063492d0) &
+                 + in(i-3,j+9) * (-0.001851851851851852d0) &
+                 + in(i-2,j+9) * (-0.004629629629629629d0) &
+                 + in(i-1,j+9) * (-0.027777777777777776d0) &
+                 + in(i+1,j+9) * (0.00018155410312273057d0) &
+                 + in(i+2,j+9) * (0.00018155410312273057d0) &
+                 + in(i+3,j+9) * (0.00018155410312273057d0) &
+                 + in(i+4,j+9) * (0.00018155410312273057d0) &
+                 + in(i+5,j+9) * (0.00018155410312273057d0) &
+                 + in(i+6,j+9) * (0.00018155410312273057d0) &
+                 + in(i+7,j+9) * (0.00018155410312273057d0) &
+                 + in(i+8,j+9) * (0.00018155410312273057d0) &
+                 + in(i+9,j+9) * (0.0030864197530864196d0) &
 +0.0
       end do
     end do
diff --git a/FORTRAN/stencil_target.f90 b/FORTRAN/stencil_target.f90
index 2f6edffe1..f2c3b7785 100644
--- a/FORTRAN/stencil_target.f90
+++ b/FORTRAN/stencil_target.f90
@@ -6,15 +6,14 @@ subroutine star1(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=1,n-1-1
-      !$omp simd
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-1) * (-0.5) &
-                 + in(i-1,j+0) * (-0.5) &
-                 + in(i+1,j+0) * (0.5) &
-                 + in(i+0,j+1) * (0.5) &
+                 + in(i+0,j-1) * (-0.5d0) &
+                 + in(i-1,j+0) * (-0.5d0) &
+                 + in(i+1,j+0) * (0.5d0) &
+                 + in(i+0,j+1) * (0.5d0) &
 +0.0
       end do
       !$omp end simd
@@ -30,19 +29,18 @@ subroutine star2(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=2,n-2-1
-      !$omp simd
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-2) * (-0.125) &
-                 + in(i+0,j-1) * (-0.25) &
-                 + in(i-2,j+0) * (-0.125) &
-                 + in(i-1,j+0) * (-0.25) &
-                 + in(i+1,j+0) * (0.25) &
-                 + in(i+2,j+0) * (0.125) &
-                 + in(i+0,j+1) * (0.25) &
-                 + in(i+0,j+2) * (0.125) &
+                 + in(i+0,j-2) * (-0.125d0) &
+                 + in(i+0,j-1) * (-0.25d0) &
+                 + in(i-2,j+0) * (-0.125d0) &
+                 + in(i-1,j+0) * (-0.25d0) &
+                 + in(i+1,j+0) * (0.25d0) &
+                 + in(i+2,j+0) * (0.125d0) &
+                 + in(i+0,j+1) * (0.25d0) &
+                 + in(i+0,j+2) * (0.125d0) &
 +0.0
       end do
       !$omp end simd
@@ -58,23 +56,22 @@ subroutine star3(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=3,n-3-1
-      !$omp simd
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-3) * (-0.05555555555555555) &
-                 + in(i+0,j-2) * (-0.08333333333333333) &
-                 + in(i+0,j-1) * (-0.16666666666666666) &
-                 + in(i-3,j+0) * (-0.05555555555555555) &
-                 + in(i-2,j+0) * (-0.08333333333333333) &
-                 + in(i-1,j+0) * (-0.16666666666666666) &
-                 + in(i+1,j+0) * (0.16666666666666666) &
-                 + in(i+2,j+0) * (0.08333333333333333) &
-                 + in(i+3,j+0) * (0.05555555555555555) &
-                 + in(i+0,j+1) * (0.16666666666666666) &
-                 + in(i+0,j+2) * (0.08333333333333333) &
-                 + in(i+0,j+3) * (0.05555555555555555) &
+                 + in(i+0,j-3) * (-0.05555555555555555d0) &
+                 + in(i+0,j-2) * (-0.08333333333333333d0) &
+                 + in(i+0,j-1) * (-0.16666666666666666d0) &
+                 + in(i-3,j+0) * (-0.05555555555555555d0) &
+                 + in(i-2,j+0) * (-0.08333333333333333d0) &
+                 + in(i-1,j+0) * (-0.16666666666666666d0) &
+                 + in(i+1,j+0) * (0.16666666666666666d0) &
+                 + in(i+2,j+0) * (0.08333333333333333d0) &
+                 + in(i+3,j+0) * (0.05555555555555555d0) &
+                 + in(i+0,j+1) * (0.16666666666666666d0) &
+                 + in(i+0,j+2) * (0.08333333333333333d0) &
+                 + in(i+0,j+3) * (0.05555555555555555d0) &
 +0.0
       end do
       !$omp end simd
@@ -90,27 +87,26 @@ subroutine star4(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=4,n-4-1
-      !$omp simd
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-4) * (-0.03125) &
-                 + in(i+0,j-3) * (-0.041666666666666664) &
-                 + in(i+0,j-2) * (-0.0625) &
-                 + in(i+0,j-1) * (-0.125) &
-                 + in(i-4,j+0) * (-0.03125) &
-                 + in(i-3,j+0) * (-0.041666666666666664) &
-                 + in(i-2,j+0) * (-0.0625) &
-                 + in(i-1,j+0) * (-0.125) &
-                 + in(i+1,j+0) * (0.125) &
-                 + in(i+2,j+0) * (0.0625) &
-                 + in(i+3,j+0) * (0.041666666666666664) &
-                 + in(i+4,j+0) * (0.03125) &
-                 + in(i+0,j+1) * (0.125) &
-                 + in(i+0,j+2) * (0.0625) &
-                 + in(i+0,j+3) * (0.041666666666666664) &
-                 + in(i+0,j+4) * (0.03125) &
+                 + in(i+0,j-4) * (-0.03125d0) &
+                 + in(i+0,j-3) * (-0.041666666666666664d0) &
+                 + in(i+0,j-2) * (-0.0625d0) &
+                 + in(i+0,j-1) * (-0.125d0) &
+                 + in(i-4,j+0) * (-0.03125d0) &
+                 + in(i-3,j+0) * (-0.041666666666666664d0) &
+                 + in(i-2,j+0) * (-0.0625d0) &
+                 + in(i-1,j+0) * (-0.125d0) &
+                 + in(i+1,j+0) * (0.125d0) &
+                 + in(i+2,j+0) * (0.0625d0) &
+                 + in(i+3,j+0) * (0.041666666666666664d0) &
+                 + in(i+4,j+0) * (0.03125d0) &
+                 + in(i+0,j+1) * (0.125d0) &
+                 + in(i+0,j+2) * (0.0625d0) &
+                 + in(i+0,j+3) * (0.041666666666666664d0) &
+                 + in(i+0,j+4) * (0.03125d0) &
 +0.0
       end do
       !$omp end simd
@@ -126,31 +122,30 @@ subroutine star5(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=5,n-5-1
-      !$omp simd
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-5) * (-0.02) &
-                 + in(i+0,j-4) * (-0.025) &
-                 + in(i+0,j-3) * (-0.03333333333333333) &
-                 + in(i+0,j-2) * (-0.05) &
-                 + in(i+0,j-1) * (-0.1) &
-                 + in(i-5,j+0) * (-0.02) &
-                 + in(i-4,j+0) * (-0.025) &
-                 + in(i-3,j+0) * (-0.03333333333333333) &
-                 + in(i-2,j+0) * (-0.05) &
-                 + in(i-1,j+0) * (-0.1) &
-                 + in(i+1,j+0) * (0.1) &
-                 + in(i+2,j+0) * (0.05) &
-                 + in(i+3,j+0) * (0.03333333333333333) &
-                 + in(i+4,j+0) * (0.025) &
-                 + in(i+5,j+0) * (0.02) &
-                 + in(i+0,j+1) * (0.1) &
-                 + in(i+0,j+2) * (0.05) &
-                 + in(i+0,j+3) * (0.03333333333333333) &
-                 + in(i+0,j+4) * (0.025) &
-                 + in(i+0,j+5) * (0.02) &
+                 + in(i+0,j-5) * (-0.02d0) &
+                 + in(i+0,j-4) * (-0.025d0) &
+                 + in(i+0,j-3) * (-0.03333333333333333d0) &
+                 + in(i+0,j-2) * (-0.05d0) &
+                 + in(i+0,j-1) * (-0.1d0) &
+                 + in(i-5,j+0) * (-0.02d0) &
+                 + in(i-4,j+0) * (-0.025d0) &
+                 + in(i-3,j+0) * (-0.03333333333333333d0) &
+                 + in(i-2,j+0) * (-0.05d0) &
+                 + in(i-1,j+0) * (-0.1d0) &
+                 + in(i+1,j+0) * (0.1d0) &
+                 + in(i+2,j+0) * (0.05d0) &
+                 + in(i+3,j+0) * (0.03333333333333333d0) &
+                 + in(i+4,j+0) * (0.025d0) &
+                 + in(i+5,j+0) * (0.02d0) &
+                 + in(i+0,j+1) * (0.1d0) &
+                 + in(i+0,j+2) * (0.05d0) &
+                 + in(i+0,j+3) * (0.03333333333333333d0) &
+                 + in(i+0,j+4) * (0.025d0) &
+                 + in(i+0,j+5) * (0.02d0) &
 +0.0
       end do
       !$omp end simd
@@ -166,35 +161,34 @@ subroutine star6(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=6,n-6-1
-      !$omp simd
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-6) * (-0.013888888888888888) &
-                 + in(i+0,j-5) * (-0.016666666666666666) &
-                 + in(i+0,j-4) * (-0.020833333333333332) &
-                 + in(i+0,j-3) * (-0.027777777777777776) &
-                 + in(i+0,j-2) * (-0.041666666666666664) &
-                 + in(i+0,j-1) * (-0.08333333333333333) &
-                 + in(i-6,j+0) * (-0.013888888888888888) &
-                 + in(i-5,j+0) * (-0.016666666666666666) &
-                 + in(i-4,j+0) * (-0.020833333333333332) &
-                 + in(i-3,j+0) * (-0.027777777777777776) &
-                 + in(i-2,j+0) * (-0.041666666666666664) &
-                 + in(i-1,j+0) * (-0.08333333333333333) &
-                 + in(i+1,j+0) * (0.08333333333333333) &
-                 + in(i+2,j+0) * (0.041666666666666664) &
-                 + in(i+3,j+0) * (0.027777777777777776) &
-                 + in(i+4,j+0) * (0.020833333333333332) &
-                 + in(i+5,j+0) * (0.016666666666666666) &
-                 + in(i+6,j+0) * (0.013888888888888888) &
-                 + in(i+0,j+1) * (0.08333333333333333) &
-                 + in(i+0,j+2) * (0.041666666666666664) &
-                 + in(i+0,j+3) * (0.027777777777777776) &
-                 + in(i+0,j+4) * (0.020833333333333332) &
-                 + in(i+0,j+5) * (0.016666666666666666) &
-                 + in(i+0,j+6) * (0.013888888888888888) &
+                 + in(i+0,j-6) * (-0.013888888888888888d0) &
+                 + in(i+0,j-5) * (-0.016666666666666666d0) &
+                 + in(i+0,j-4) * (-0.020833333333333332d0) &
+                 + in(i+0,j-3) * (-0.027777777777777776d0) &
+                 + in(i+0,j-2) * (-0.041666666666666664d0) &
+                 + in(i+0,j-1) * (-0.08333333333333333d0) &
+                 + in(i-6,j+0) * (-0.013888888888888888d0) &
+                 + in(i-5,j+0) * (-0.016666666666666666d0) &
+                 + in(i-4,j+0) * (-0.020833333333333332d0) &
+                 + in(i-3,j+0) * (-0.027777777777777776d0) &
+                 + in(i-2,j+0) * (-0.041666666666666664d0) &
+                 + in(i-1,j+0) * (-0.08333333333333333d0) &
+                 + in(i+1,j+0) * (0.08333333333333333d0) &
+                 + in(i+2,j+0) * (0.041666666666666664d0) &
+                 + in(i+3,j+0) * (0.027777777777777776d0) &
+                 + in(i+4,j+0) * (0.020833333333333332d0) &
+                 + in(i+5,j+0) * (0.016666666666666666d0) &
+                 + in(i+6,j+0) * (0.013888888888888888d0) &
+                 + in(i+0,j+1) * (0.08333333333333333d0) &
+                 + in(i+0,j+2) * (0.041666666666666664d0) &
+                 + in(i+0,j+3) * (0.027777777777777776d0) &
+                 + in(i+0,j+4) * (0.020833333333333332d0) &
+                 + in(i+0,j+5) * (0.016666666666666666d0) &
+                 + in(i+0,j+6) * (0.013888888888888888d0) &
 +0.0
       end do
       !$omp end simd
@@ -210,39 +204,38 @@ subroutine star7(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=7,n-7-1
-      !$omp simd
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-7) * (-0.01020408163265306) &
-                 + in(i+0,j-6) * (-0.011904761904761904) &
-                 + in(i+0,j-5) * (-0.014285714285714285) &
-                 + in(i+0,j-4) * (-0.017857142857142856) &
-                 + in(i+0,j-3) * (-0.023809523809523808) &
-                 + in(i+0,j-2) * (-0.03571428571428571) &
-                 + in(i+0,j-1) * (-0.07142857142857142) &
-                 + in(i-7,j+0) * (-0.01020408163265306) &
-                 + in(i-6,j+0) * (-0.011904761904761904) &
-                 + in(i-5,j+0) * (-0.014285714285714285) &
-                 + in(i-4,j+0) * (-0.017857142857142856) &
-                 + in(i-3,j+0) * (-0.023809523809523808) &
-                 + in(i-2,j+0) * (-0.03571428571428571) &
-                 + in(i-1,j+0) * (-0.07142857142857142) &
-                 + in(i+1,j+0) * (0.07142857142857142) &
-                 + in(i+2,j+0) * (0.03571428571428571) &
-                 + in(i+3,j+0) * (0.023809523809523808) &
-                 + in(i+4,j+0) * (0.017857142857142856) &
-                 + in(i+5,j+0) * (0.014285714285714285) &
-                 + in(i+6,j+0) * (0.011904761904761904) &
-                 + in(i+7,j+0) * (0.01020408163265306) &
-                 + in(i+0,j+1) * (0.07142857142857142) &
-                 + in(i+0,j+2) * (0.03571428571428571) &
-                 + in(i+0,j+3) * (0.023809523809523808) &
-                 + in(i+0,j+4) * (0.017857142857142856) &
-                 + in(i+0,j+5) * (0.014285714285714285) &
-                 + in(i+0,j+6) * (0.011904761904761904) &
-                 + in(i+0,j+7) * (0.01020408163265306) &
+                 + in(i+0,j-7) * (-0.01020408163265306d0) &
+                 + in(i+0,j-6) * (-0.011904761904761904d0) &
+                 + in(i+0,j-5) * (-0.014285714285714285d0) &
+                 + in(i+0,j-4) * (-0.017857142857142856d0) &
+                 + in(i+0,j-3) * (-0.023809523809523808d0) &
+                 + in(i+0,j-2) * (-0.03571428571428571d0) &
+                 + in(i+0,j-1) * (-0.07142857142857142d0) &
+                 + in(i-7,j+0) * (-0.01020408163265306d0) &
+                 + in(i-6,j+0) * (-0.011904761904761904d0) &
+                 + in(i-5,j+0) * (-0.014285714285714285d0) &
+                 + in(i-4,j+0) * (-0.017857142857142856d0) &
+                 + in(i-3,j+0) * (-0.023809523809523808d0) &
+                 + in(i-2,j+0) * (-0.03571428571428571d0) &
+                 + in(i-1,j+0) * (-0.07142857142857142d0) &
+                 + in(i+1,j+0) * (0.07142857142857142d0) &
+                 + in(i+2,j+0) * (0.03571428571428571d0) &
+                 + in(i+3,j+0) * (0.023809523809523808d0) &
+                 + in(i+4,j+0) * (0.017857142857142856d0) &
+                 + in(i+5,j+0) * (0.014285714285714285d0) &
+                 + in(i+6,j+0) * (0.011904761904761904d0) &
+                 + in(i+7,j+0) * (0.01020408163265306d0) &
+                 + in(i+0,j+1) * (0.07142857142857142d0) &
+                 + in(i+0,j+2) * (0.03571428571428571d0) &
+                 + in(i+0,j+3) * (0.023809523809523808d0) &
+                 + in(i+0,j+4) * (0.017857142857142856d0) &
+                 + in(i+0,j+5) * (0.014285714285714285d0) &
+                 + in(i+0,j+6) * (0.011904761904761904d0) &
+                 + in(i+0,j+7) * (0.01020408163265306d0) &
 +0.0
       end do
       !$omp end simd
@@ -258,43 +251,42 @@ subroutine star8(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=8,n-8-1
-      !$omp simd
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-8) * (-0.0078125) &
-                 + in(i+0,j-7) * (-0.008928571428571428) &
-                 + in(i+0,j-6) * (-0.010416666666666666) &
-                 + in(i+0,j-5) * (-0.0125) &
-                 + in(i+0,j-4) * (-0.015625) &
-                 + in(i+0,j-3) * (-0.020833333333333332) &
-                 + in(i+0,j-2) * (-0.03125) &
-                 + in(i+0,j-1) * (-0.0625) &
-                 + in(i-8,j+0) * (-0.0078125) &
-                 + in(i-7,j+0) * (-0.008928571428571428) &
-                 + in(i-6,j+0) * (-0.010416666666666666) &
-                 + in(i-5,j+0) * (-0.0125) &
-                 + in(i-4,j+0) * (-0.015625) &
-                 + in(i-3,j+0) * (-0.020833333333333332) &
-                 + in(i-2,j+0) * (-0.03125) &
-                 + in(i-1,j+0) * (-0.0625) &
-                 + in(i+1,j+0) * (0.0625) &
-                 + in(i+2,j+0) * (0.03125) &
-                 + in(i+3,j+0) * (0.020833333333333332) &
-                 + in(i+4,j+0) * (0.015625) &
-                 + in(i+5,j+0) * (0.0125) &
-                 + in(i+6,j+0) * (0.010416666666666666) &
-                 + in(i+7,j+0) * (0.008928571428571428) &
-                 + in(i+8,j+0) * (0.0078125) &
-                 + in(i+0,j+1) * (0.0625) &
-                 + in(i+0,j+2) * (0.03125) &
-                 + in(i+0,j+3) * (0.020833333333333332) &
-                 + in(i+0,j+4) * (0.015625) &
-                 + in(i+0,j+5) * (0.0125) &
-                 + in(i+0,j+6) * (0.010416666666666666) &
-                 + in(i+0,j+7) * (0.008928571428571428) &
-                 + in(i+0,j+8) * (0.0078125) &
+                 + in(i+0,j-8) * (-0.0078125d0) &
+                 + in(i+0,j-7) * (-0.008928571428571428d0) &
+                 + in(i+0,j-6) * (-0.010416666666666666d0) &
+                 + in(i+0,j-5) * (-0.0125d0) &
+                 + in(i+0,j-4) * (-0.015625d0) &
+                 + in(i+0,j-3) * (-0.020833333333333332d0) &
+                 + in(i+0,j-2) * (-0.03125d0) &
+                 + in(i+0,j-1) * (-0.0625d0) &
+                 + in(i-8,j+0) * (-0.0078125d0) &
+                 + in(i-7,j+0) * (-0.008928571428571428d0) &
+                 + in(i-6,j+0) * (-0.010416666666666666d0) &
+                 + in(i-5,j+0) * (-0.0125d0) &
+                 + in(i-4,j+0) * (-0.015625d0) &
+                 + in(i-3,j+0) * (-0.020833333333333332d0) &
+                 + in(i-2,j+0) * (-0.03125d0) &
+                 + in(i-1,j+0) * (-0.0625d0) &
+                 + in(i+1,j+0) * (0.0625d0) &
+                 + in(i+2,j+0) * (0.03125d0) &
+                 + in(i+3,j+0) * (0.020833333333333332d0) &
+                 + in(i+4,j+0) * (0.015625d0) &
+                 + in(i+5,j+0) * (0.0125d0) &
+                 + in(i+6,j+0) * (0.010416666666666666d0) &
+                 + in(i+7,j+0) * (0.008928571428571428d0) &
+                 + in(i+8,j+0) * (0.0078125d0) &
+                 + in(i+0,j+1) * (0.0625d0) &
+                 + in(i+0,j+2) * (0.03125d0) &
+                 + in(i+0,j+3) * (0.020833333333333332d0) &
+                 + in(i+0,j+4) * (0.015625d0) &
+                 + in(i+0,j+5) * (0.0125d0) &
+                 + in(i+0,j+6) * (0.010416666666666666d0) &
+                 + in(i+0,j+7) * (0.008928571428571428d0) &
+                 + in(i+0,j+8) * (0.0078125d0) &
 +0.0
       end do
       !$omp end simd
@@ -310,47 +302,46 @@ subroutine star9(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=9,n-9-1
-      !$omp simd
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-9) * (-0.006172839506172839) &
-                 + in(i+0,j-8) * (-0.006944444444444444) &
-                 + in(i+0,j-7) * (-0.007936507936507936) &
-                 + in(i+0,j-6) * (-0.009259259259259259) &
-                 + in(i+0,j-5) * (-0.011111111111111112) &
-                 + in(i+0,j-4) * (-0.013888888888888888) &
-                 + in(i+0,j-3) * (-0.018518518518518517) &
-                 + in(i+0,j-2) * (-0.027777777777777776) &
-                 + in(i+0,j-1) * (-0.05555555555555555) &
-                 + in(i-9,j+0) * (-0.006172839506172839) &
-                 + in(i-8,j+0) * (-0.006944444444444444) &
-                 + in(i-7,j+0) * (-0.007936507936507936) &
-                 + in(i-6,j+0) * (-0.009259259259259259) &
-                 + in(i-5,j+0) * (-0.011111111111111112) &
-                 + in(i-4,j+0) * (-0.013888888888888888) &
-                 + in(i-3,j+0) * (-0.018518518518518517) &
-                 + in(i-2,j+0) * (-0.027777777777777776) &
-                 + in(i-1,j+0) * (-0.05555555555555555) &
-                 + in(i+1,j+0) * (0.05555555555555555) &
-                 + in(i+2,j+0) * (0.027777777777777776) &
-                 + in(i+3,j+0) * (0.018518518518518517) &
-                 + in(i+4,j+0) * (0.013888888888888888) &
-                 + in(i+5,j+0) * (0.011111111111111112) &
-                 + in(i+6,j+0) * (0.009259259259259259) &
-                 + in(i+7,j+0) * (0.007936507936507936) &
-                 + in(i+8,j+0) * (0.006944444444444444) &
-                 + in(i+9,j+0) * (0.006172839506172839) &
-                 + in(i+0,j+1) * (0.05555555555555555) &
-                 + in(i+0,j+2) * (0.027777777777777776) &
-                 + in(i+0,j+3) * (0.018518518518518517) &
-                 + in(i+0,j+4) * (0.013888888888888888) &
-                 + in(i+0,j+5) * (0.011111111111111112) &
-                 + in(i+0,j+6) * (0.009259259259259259) &
-                 + in(i+0,j+7) * (0.007936507936507936) &
-                 + in(i+0,j+8) * (0.006944444444444444) &
-                 + in(i+0,j+9) * (0.006172839506172839) &
+                 + in(i+0,j-9) * (-0.006172839506172839d0) &
+                 + in(i+0,j-8) * (-0.006944444444444444d0) &
+                 + in(i+0,j-7) * (-0.007936507936507936d0) &
+                 + in(i+0,j-6) * (-0.009259259259259259d0) &
+                 + in(i+0,j-5) * (-0.011111111111111112d0) &
+                 + in(i+0,j-4) * (-0.013888888888888888d0) &
+                 + in(i+0,j-3) * (-0.018518518518518517d0) &
+                 + in(i+0,j-2) * (-0.027777777777777776d0) &
+                 + in(i+0,j-1) * (-0.05555555555555555d0) &
+                 + in(i-9,j+0) * (-0.006172839506172839d0) &
+                 + in(i-8,j+0) * (-0.006944444444444444d0) &
+                 + in(i-7,j+0) * (-0.007936507936507936d0) &
+                 + in(i-6,j+0) * (-0.009259259259259259d0) &
+                 + in(i-5,j+0) * (-0.011111111111111112d0) &
+                 + in(i-4,j+0) * (-0.013888888888888888d0) &
+                 + in(i-3,j+0) * (-0.018518518518518517d0) &
+                 + in(i-2,j+0) * (-0.027777777777777776d0) &
+                 + in(i-1,j+0) * (-0.05555555555555555d0) &
+                 + in(i+1,j+0) * (0.05555555555555555d0) &
+                 + in(i+2,j+0) * (0.027777777777777776d0) &
+                 + in(i+3,j+0) * (0.018518518518518517d0) &
+                 + in(i+4,j+0) * (0.013888888888888888d0) &
+                 + in(i+5,j+0) * (0.011111111111111112d0) &
+                 + in(i+6,j+0) * (0.009259259259259259d0) &
+                 + in(i+7,j+0) * (0.007936507936507936d0) &
+                 + in(i+8,j+0) * (0.006944444444444444d0) &
+                 + in(i+9,j+0) * (0.006172839506172839d0) &
+                 + in(i+0,j+1) * (0.05555555555555555d0) &
+                 + in(i+0,j+2) * (0.027777777777777776d0) &
+                 + in(i+0,j+3) * (0.018518518518518517d0) &
+                 + in(i+0,j+4) * (0.013888888888888888d0) &
+                 + in(i+0,j+5) * (0.011111111111111112d0) &
+                 + in(i+0,j+6) * (0.009259259259259259d0) &
+                 + in(i+0,j+7) * (0.007936507936507936d0) &
+                 + in(i+0,j+8) * (0.006944444444444444d0) &
+                 + in(i+0,j+9) * (0.006172839506172839d0) &
 +0.0
       end do
       !$omp end simd
@@ -366,15 +357,14 @@ subroutine grid1(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=1,n-1-1
-      !$omp simd
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i-1,j-1) * (-0.25) &
-                 + in(i+1,j-1) * (-0.25) &
-                 + in(i-1,j+1) * (-0.25) &
-                 + in(i+1,j+1) * (0.25) &
+                 + in(i-1,j-1) * (-0.25d0) &
+                 + in(i+1,j-1) * (-0.25d0) &
+                 + in(i-1,j+1) * (-0.25d0) &
+                 + in(i+1,j+1) * (0.25d0) &
 +0.0
       end do
       !$omp end simd
@@ -390,25 +380,24 @@ subroutine grid2(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=2,n-2-1
-      !$omp simd
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i-2,j-2) * (-0.0625) &
-                 + in(i+1,j-2) * (-0.020833333333333332) &
-                 + in(i+2,j-2) * (-0.020833333333333332) &
-                 + in(i-1,j-1) * (-0.125) &
-                 + in(i+1,j-1) * (-0.125) &
-                 + in(i+2,j-1) * (-0.125) &
-                 + in(i-2,j+1) * (-0.020833333333333332) &
-                 + in(i-1,j+1) * (-0.125) &
-                 + in(i+1,j+1) * (0.125) &
-                 + in(i+2,j+1) * (0.020833333333333332) &
-                 + in(i-2,j+2) * (-0.020833333333333332) &
-                 + in(i-1,j+2) * (-0.125) &
-                 + in(i+1,j+2) * (0.020833333333333332) &
-                 + in(i+2,j+2) * (0.0625) &
+                 + in(i-2,j-2) * (-0.0625d0) &
+                 + in(i+1,j-2) * (-0.020833333333333332d0) &
+                 + in(i+2,j-2) * (-0.020833333333333332d0) &
+                 + in(i-1,j-1) * (-0.125d0) &
+                 + in(i+1,j-1) * (-0.125d0) &
+                 + in(i+2,j-1) * (-0.125d0) &
+                 + in(i-2,j+1) * (-0.020833333333333332d0) &
+                 + in(i-1,j+1) * (-0.125d0) &
+                 + in(i+1,j+1) * (0.125d0) &
+                 + in(i+2,j+1) * (0.020833333333333332d0) &
+                 + in(i-2,j+2) * (-0.020833333333333332d0) &
+                 + in(i-1,j+2) * (-0.125d0) &
+                 + in(i+1,j+2) * (0.020833333333333332d0) &
+                 + in(i+2,j+2) * (0.0625d0) &
 +0.0
       end do
       !$omp end simd
@@ -424,41 +413,40 @@ subroutine grid3(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=3,n-3-1
-      !$omp simd
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i-3,j-3) * (-0.027777777777777776) &
-                 + in(i+1,j-3) * (-0.005555555555555556) &
-                 + in(i+2,j-3) * (-0.005555555555555556) &
-                 + in(i+3,j-3) * (-0.005555555555555556) &
-                 + in(i-2,j-2) * (-0.041666666666666664) &
-                 + in(i+1,j-2) * (-0.013888888888888888) &
-                 + in(i+2,j-2) * (-0.013888888888888888) &
-                 + in(i+3,j-2) * (-0.013888888888888888) &
-                 + in(i-1,j-1) * (-0.08333333333333333) &
-                 + in(i+1,j-1) * (-0.08333333333333333) &
-                 + in(i+2,j-1) * (-0.08333333333333333) &
-                 + in(i+3,j-1) * (-0.08333333333333333) &
-                 + in(i-3,j+1) * (-0.005555555555555556) &
-                 + in(i-2,j+1) * (-0.013888888888888888) &
-                 + in(i-1,j+1) * (-0.08333333333333333) &
-                 + in(i+1,j+1) * (0.08333333333333333) &
-                 + in(i+2,j+1) * (0.013888888888888888) &
-                 + in(i+3,j+1) * (0.005555555555555556) &
-                 + in(i-3,j+2) * (-0.005555555555555556) &
-                 + in(i-2,j+2) * (-0.013888888888888888) &
-                 + in(i-1,j+2) * (-0.08333333333333333) &
-                 + in(i+1,j+2) * (0.013888888888888888) &
-                 + in(i+2,j+2) * (0.041666666666666664) &
-                 + in(i+3,j+2) * (0.005555555555555556) &
-                 + in(i-3,j+3) * (-0.005555555555555556) &
-                 + in(i-2,j+3) * (-0.013888888888888888) &
-                 + in(i-1,j+3) * (-0.08333333333333333) &
-                 + in(i+1,j+3) * (0.005555555555555556) &
-                 + in(i+2,j+3) * (0.005555555555555556) &
-                 + in(i+3,j+3) * (0.027777777777777776) &
+                 + in(i-3,j-3) * (-0.027777777777777776d0) &
+                 + in(i+1,j-3) * (-0.005555555555555556d0) &
+                 + in(i+2,j-3) * (-0.005555555555555556d0) &
+                 + in(i+3,j-3) * (-0.005555555555555556d0) &
+                 + in(i-2,j-2) * (-0.041666666666666664d0) &
+                 + in(i+1,j-2) * (-0.013888888888888888d0) &
+                 + in(i+2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+3,j-2) * (-0.013888888888888888d0) &
+                 + in(i-1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+2,j-1) * (-0.08333333333333333d0) &
+                 + in(i+3,j-1) * (-0.08333333333333333d0) &
+                 + in(i-3,j+1) * (-0.005555555555555556d0) &
+                 + in(i-2,j+1) * (-0.013888888888888888d0) &
+                 + in(i-1,j+1) * (-0.08333333333333333d0) &
+                 + in(i+1,j+1) * (0.08333333333333333d0) &
+                 + in(i+2,j+1) * (0.013888888888888888d0) &
+                 + in(i+3,j+1) * (0.005555555555555556d0) &
+                 + in(i-3,j+2) * (-0.005555555555555556d0) &
+                 + in(i-2,j+2) * (-0.013888888888888888d0) &
+                 + in(i-1,j+2) * (-0.08333333333333333d0) &
+                 + in(i+1,j+2) * (0.013888888888888888d0) &
+                 + in(i+2,j+2) * (0.041666666666666664d0) &
+                 + in(i+3,j+2) * (0.005555555555555556d0) &
+                 + in(i-3,j+3) * (-0.005555555555555556d0) &
+                 + in(i-2,j+3) * (-0.013888888888888888d0) &
+                 + in(i-1,j+3) * (-0.08333333333333333d0) &
+                 + in(i+1,j+3) * (0.005555555555555556d0) &
+                 + in(i+2,j+3) * (0.005555555555555556d0) &
+                 + in(i+3,j+3) * (0.027777777777777776d0) &
 +0.0
       end do
       !$omp end simd
@@ -474,63 +462,62 @@ subroutine grid4(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=4,n-4-1
-      !$omp simd
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i-4,j-4) * (-0.015625) &
-                 + in(i+1,j-4) * (-0.002232142857142857) &
-                 + in(i+2,j-4) * (-0.002232142857142857) &
-                 + in(i+3,j-4) * (-0.002232142857142857) &
-                 + in(i+4,j-4) * (-0.002232142857142857) &
-                 + in(i-3,j-3) * (-0.020833333333333332) &
-                 + in(i+1,j-3) * (-0.004166666666666667) &
-                 + in(i+2,j-3) * (-0.004166666666666667) &
-                 + in(i+3,j-3) * (-0.004166666666666667) &
-                 + in(i+4,j-3) * (-0.004166666666666667) &
-                 + in(i-2,j-2) * (-0.03125) &
-                 + in(i+1,j-2) * (-0.010416666666666666) &
-                 + in(i+2,j-2) * (-0.010416666666666666) &
-                 + in(i+3,j-2) * (-0.010416666666666666) &
-                 + in(i+4,j-2) * (-0.010416666666666666) &
-                 + in(i-1,j-1) * (-0.0625) &
-                 + in(i+1,j-1) * (-0.0625) &
-                 + in(i+2,j-1) * (-0.0625) &
-                 + in(i+3,j-1) * (-0.0625) &
-                 + in(i+4,j-1) * (-0.0625) &
-                 + in(i-4,j+1) * (-0.002232142857142857) &
-                 + in(i-3,j+1) * (-0.004166666666666667) &
-                 + in(i-2,j+1) * (-0.010416666666666666) &
-                 + in(i-1,j+1) * (-0.0625) &
-                 + in(i+1,j+1) * (0.0625) &
-                 + in(i+2,j+1) * (0.010416666666666666) &
-                 + in(i+3,j+1) * (0.004166666666666667) &
-                 + in(i+4,j+1) * (0.002232142857142857) &
-                 + in(i-4,j+2) * (-0.002232142857142857) &
-                 + in(i-3,j+2) * (-0.004166666666666667) &
-                 + in(i-2,j+2) * (-0.010416666666666666) &
-                 + in(i-1,j+2) * (-0.0625) &
-                 + in(i+1,j+2) * (0.010416666666666666) &
-                 + in(i+2,j+2) * (0.03125) &
-                 + in(i+3,j+2) * (0.004166666666666667) &
-                 + in(i+4,j+2) * (0.002232142857142857) &
-                 + in(i-4,j+3) * (-0.002232142857142857) &
-                 + in(i-3,j+3) * (-0.004166666666666667) &
-                 + in(i-2,j+3) * (-0.010416666666666666) &
-                 + in(i-1,j+3) * (-0.0625) &
-                 + in(i+1,j+3) * (0.004166666666666667) &
-                 + in(i+2,j+3) * (0.004166666666666667) &
-                 + in(i+3,j+3) * (0.020833333333333332) &
-                 + in(i+4,j+3) * (0.002232142857142857) &
-                 + in(i-4,j+4) * (-0.002232142857142857) &
-                 + in(i-3,j+4) * (-0.004166666666666667) &
-                 + in(i-2,j+4) * (-0.010416666666666666) &
-                 + in(i-1,j+4) * (-0.0625) &
-                 + in(i+1,j+4) * (0.002232142857142857) &
-                 + in(i+2,j+4) * (0.002232142857142857) &
-                 + in(i+3,j+4) * (0.002232142857142857) &
-                 + in(i+4,j+4) * (0.015625) &
+                 + in(i-4,j-4) * (-0.015625d0) &
+                 + in(i+1,j-4) * (-0.002232142857142857d0) &
+                 + in(i+2,j-4) * (-0.002232142857142857d0) &
+                 + in(i+3,j-4) * (-0.002232142857142857d0) &
+                 + in(i+4,j-4) * (-0.002232142857142857d0) &
+                 + in(i-3,j-3) * (-0.020833333333333332d0) &
+                 + in(i+1,j-3) * (-0.004166666666666667d0) &
+                 + in(i+2,j-3) * (-0.004166666666666667d0) &
+                 + in(i+3,j-3) * (-0.004166666666666667d0) &
+                 + in(i+4,j-3) * (-0.004166666666666667d0) &
+                 + in(i-2,j-2) * (-0.03125d0) &
+                 + in(i+1,j-2) * (-0.010416666666666666d0) &
+                 + in(i+2,j-2) * (-0.010416666666666666d0) &
+                 + in(i+3,j-2) * (-0.010416666666666666d0) &
+                 + in(i+4,j-2) * (-0.010416666666666666d0) &
+                 + in(i-1,j-1) * (-0.0625d0) &
+                 + in(i+1,j-1) * (-0.0625d0) &
+                 + in(i+2,j-1) * (-0.0625d0) &
+                 + in(i+3,j-1) * (-0.0625d0) &
+                 + in(i+4,j-1) * (-0.0625d0) &
+                 + in(i-4,j+1) * (-0.002232142857142857d0) &
+                 + in(i-3,j+1) * (-0.004166666666666667d0) &
+                 + in(i-2,j+1) * (-0.010416666666666666d0) &
+                 + in(i-1,j+1) * (-0.0625d0) &
+                 + in(i+1,j+1) * (0.0625d0) &
+                 + in(i+2,j+1) * (0.010416666666666666d0) &
+                 + in(i+3,j+1) * (0.004166666666666667d0) &
+                 + in(i+4,j+1) * (0.002232142857142857d0) &
+                 + in(i-4,j+2) * (-0.002232142857142857d0) &
+                 + in(i-3,j+2) * (-0.004166666666666667d0) &
+                 + in(i-2,j+2) * (-0.010416666666666666d0) &
+                 + in(i-1,j+2) * (-0.0625d0) &
+                 + in(i+1,j+2) * (0.010416666666666666d0) &
+                 + in(i+2,j+2) * (0.03125d0) &
+                 + in(i+3,j+2) * (0.004166666666666667d0) &
+                 + in(i+4,j+2) * (0.002232142857142857d0) &
+                 + in(i-4,j+3) * (-0.002232142857142857d0) &
+                 + in(i-3,j+3) * (-0.004166666666666667d0) &
+                 + in(i-2,j+3) * (-0.010416666666666666d0) &
+                 + in(i-1,j+3) * (-0.0625d0) &
+                 + in(i+1,j+3) * (0.004166666666666667d0) &
+                 + in(i+2,j+3) * (0.004166666666666667d0) &
+                 + in(i+3,j+3) * (0.020833333333333332d0) &
+                 + in(i+4,j+3) * (0.002232142857142857d0) &
+                 + in(i-4,j+4) * (-0.002232142857142857d0) &
+                 + in(i-3,j+4) * (-0.004166666666666667d0) &
+                 + in(i-2,j+4) * (-0.010416666666666666d0) &
+                 + in(i-1,j+4) * (-0.0625d0) &
+                 + in(i+1,j+4) * (0.002232142857142857d0) &
+                 + in(i+2,j+4) * (0.002232142857142857d0) &
+                 + in(i+3,j+4) * (0.002232142857142857d0) &
+                 + in(i+4,j+4) * (0.015625d0) &
 +0.0
       end do
       !$omp end simd
@@ -546,91 +533,90 @@ subroutine grid5(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=5,n-5-1
-      !$omp simd
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i-5,j-5) * (-0.01) &
-                 + in(i+1,j-5) * (-0.0011111111111111111) &
-                 + in(i+2,j-5) * (-0.0011111111111111111) &
-                 + in(i+3,j-5) * (-0.0011111111111111111) &
-                 + in(i+4,j-5) * (-0.0011111111111111111) &
-                 + in(i+5,j-5) * (-0.0011111111111111111) &
-                 + in(i-4,j-4) * (-0.0125) &
-                 + in(i+1,j-4) * (-0.0017857142857142857) &
-                 + in(i+2,j-4) * (-0.0017857142857142857) &
-                 + in(i+3,j-4) * (-0.0017857142857142857) &
-                 + in(i+4,j-4) * (-0.0017857142857142857) &
-                 + in(i+5,j-4) * (-0.0017857142857142857) &
-                 + in(i-3,j-3) * (-0.016666666666666666) &
-                 + in(i+1,j-3) * (-0.0033333333333333335) &
-                 + in(i+2,j-3) * (-0.0033333333333333335) &
-                 + in(i+3,j-3) * (-0.0033333333333333335) &
-                 + in(i+4,j-3) * (-0.0033333333333333335) &
-                 + in(i+5,j-3) * (-0.0033333333333333335) &
-                 + in(i-2,j-2) * (-0.025) &
-                 + in(i+1,j-2) * (-0.008333333333333333) &
-                 + in(i+2,j-2) * (-0.008333333333333333) &
-                 + in(i+3,j-2) * (-0.008333333333333333) &
-                 + in(i+4,j-2) * (-0.008333333333333333) &
-                 + in(i+5,j-2) * (-0.008333333333333333) &
-                 + in(i-1,j-1) * (-0.05) &
-                 + in(i+1,j-1) * (-0.05) &
-                 + in(i+2,j-1) * (-0.05) &
-                 + in(i+3,j-1) * (-0.05) &
-                 + in(i+4,j-1) * (-0.05) &
-                 + in(i+5,j-1) * (-0.05) &
-                 + in(i-5,j+1) * (-0.0011111111111111111) &
-                 + in(i-4,j+1) * (-0.0017857142857142857) &
-                 + in(i-3,j+1) * (-0.0033333333333333335) &
-                 + in(i-2,j+1) * (-0.008333333333333333) &
-                 + in(i-1,j+1) * (-0.05) &
-                 + in(i+1,j+1) * (0.05) &
-                 + in(i+2,j+1) * (0.008333333333333333) &
-                 + in(i+3,j+1) * (0.0033333333333333335) &
-                 + in(i+4,j+1) * (0.0017857142857142857) &
-                 + in(i+5,j+1) * (0.0011111111111111111) &
-                 + in(i-5,j+2) * (-0.0011111111111111111) &
-                 + in(i-4,j+2) * (-0.0017857142857142857) &
-                 + in(i-3,j+2) * (-0.0033333333333333335) &
-                 + in(i-2,j+2) * (-0.008333333333333333) &
-                 + in(i-1,j+2) * (-0.05) &
-                 + in(i+1,j+2) * (0.008333333333333333) &
-                 + in(i+2,j+2) * (0.025) &
-                 + in(i+3,j+2) * (0.0033333333333333335) &
-                 + in(i+4,j+2) * (0.0017857142857142857) &
-                 + in(i+5,j+2) * (0.0011111111111111111) &
-                 + in(i-5,j+3) * (-0.0011111111111111111) &
-                 + in(i-4,j+3) * (-0.0017857142857142857) &
-                 + in(i-3,j+3) * (-0.0033333333333333335) &
-                 + in(i-2,j+3) * (-0.008333333333333333) &
-                 + in(i-1,j+3) * (-0.05) &
-                 + in(i+1,j+3) * (0.0033333333333333335) &
-                 + in(i+2,j+3) * (0.0033333333333333335) &
-                 + in(i+3,j+3) * (0.016666666666666666) &
-                 + in(i+4,j+3) * (0.0017857142857142857) &
-                 + in(i+5,j+3) * (0.0011111111111111111) &
-                 + in(i-5,j+4) * (-0.0011111111111111111) &
-                 + in(i-4,j+4) * (-0.0017857142857142857) &
-                 + in(i-3,j+4) * (-0.0033333333333333335) &
-                 + in(i-2,j+4) * (-0.008333333333333333) &
-                 + in(i-1,j+4) * (-0.05) &
-                 + in(i+1,j+4) * (0.0017857142857142857) &
-                 + in(i+2,j+4) * (0.0017857142857142857) &
-                 + in(i+3,j+4) * (0.0017857142857142857) &
-                 + in(i+4,j+4) * (0.0125) &
-                 + in(i+5,j+4) * (0.0011111111111111111) &
-                 + in(i-5,j+5) * (-0.0011111111111111111) &
-                 + in(i-4,j+5) * (-0.0017857142857142857) &
-                 + in(i-3,j+5) * (-0.0033333333333333335) &
-                 + in(i-2,j+5) * (-0.008333333333333333) &
-                 + in(i-1,j+5) * (-0.05) &
-                 + in(i+1,j+5) * (0.0011111111111111111) &
-                 + in(i+2,j+5) * (0.0011111111111111111) &
-                 + in(i+3,j+5) * (0.0011111111111111111) &
-                 + in(i+4,j+5) * (0.0011111111111111111) &
-                 + in(i+5,j+5) * (0.01) &
+                 + in(i-5,j-5) * (-0.01d0) &
+                 + in(i+1,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+2,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+3,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+4,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+5,j-5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j-4) * (-0.0125d0) &
+                 + in(i+1,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+2,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+3,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+4,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+5,j-4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j-3) * (-0.016666666666666666d0) &
+                 + in(i+1,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+2,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+3,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+4,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+5,j-3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j-2) * (-0.025d0) &
+                 + in(i+1,j-2) * (-0.008333333333333333d0) &
+                 + in(i+2,j-2) * (-0.008333333333333333d0) &
+                 + in(i+3,j-2) * (-0.008333333333333333d0) &
+                 + in(i+4,j-2) * (-0.008333333333333333d0) &
+                 + in(i+5,j-2) * (-0.008333333333333333d0) &
+                 + in(i-1,j-1) * (-0.05d0) &
+                 + in(i+1,j-1) * (-0.05d0) &
+                 + in(i+2,j-1) * (-0.05d0) &
+                 + in(i+3,j-1) * (-0.05d0) &
+                 + in(i+4,j-1) * (-0.05d0) &
+                 + in(i+5,j-1) * (-0.05d0) &
+                 + in(i-5,j+1) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+1) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+1) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+1) * (-0.008333333333333333d0) &
+                 + in(i-1,j+1) * (-0.05d0) &
+                 + in(i+1,j+1) * (0.05d0) &
+                 + in(i+2,j+1) * (0.008333333333333333d0) &
+                 + in(i+3,j+1) * (0.0033333333333333335d0) &
+                 + in(i+4,j+1) * (0.0017857142857142857d0) &
+                 + in(i+5,j+1) * (0.0011111111111111111d0) &
+                 + in(i-5,j+2) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+2) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+2) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+2) * (-0.008333333333333333d0) &
+                 + in(i-1,j+2) * (-0.05d0) &
+                 + in(i+1,j+2) * (0.008333333333333333d0) &
+                 + in(i+2,j+2) * (0.025d0) &
+                 + in(i+3,j+2) * (0.0033333333333333335d0) &
+                 + in(i+4,j+2) * (0.0017857142857142857d0) &
+                 + in(i+5,j+2) * (0.0011111111111111111d0) &
+                 + in(i-5,j+3) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+3) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+3) * (-0.008333333333333333d0) &
+                 + in(i-1,j+3) * (-0.05d0) &
+                 + in(i+1,j+3) * (0.0033333333333333335d0) &
+                 + in(i+2,j+3) * (0.0033333333333333335d0) &
+                 + in(i+3,j+3) * (0.016666666666666666d0) &
+                 + in(i+4,j+3) * (0.0017857142857142857d0) &
+                 + in(i+5,j+3) * (0.0011111111111111111d0) &
+                 + in(i-5,j+4) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+4) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+4) * (-0.008333333333333333d0) &
+                 + in(i-1,j+4) * (-0.05d0) &
+                 + in(i+1,j+4) * (0.0017857142857142857d0) &
+                 + in(i+2,j+4) * (0.0017857142857142857d0) &
+                 + in(i+3,j+4) * (0.0017857142857142857d0) &
+                 + in(i+4,j+4) * (0.0125d0) &
+                 + in(i+5,j+4) * (0.0011111111111111111d0) &
+                 + in(i-5,j+5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+5) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+5) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+5) * (-0.008333333333333333d0) &
+                 + in(i-1,j+5) * (-0.05d0) &
+                 + in(i+1,j+5) * (0.0011111111111111111d0) &
+                 + in(i+2,j+5) * (0.0011111111111111111d0) &
+                 + in(i+3,j+5) * (0.0011111111111111111d0) &
+                 + in(i+4,j+5) * (0.0011111111111111111d0) &
+                 + in(i+5,j+5) * (0.01d0) &
 +0.0
       end do
       !$omp end simd
@@ -646,125 +632,124 @@ subroutine grid6(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=6,n-6-1
-      !$omp simd
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i-6,j-6) * (-0.006944444444444444) &
-                 + in(i+1,j-6) * (-0.0006313131313131314) &
-                 + in(i+2,j-6) * (-0.0006313131313131314) &
-                 + in(i+3,j-6) * (-0.0006313131313131314) &
-                 + in(i+4,j-6) * (-0.0006313131313131314) &
-                 + in(i+5,j-6) * (-0.0006313131313131314) &
-                 + in(i+6,j-6) * (-0.0006313131313131314) &
-                 + in(i-5,j-5) * (-0.008333333333333333) &
-                 + in(i+1,j-5) * (-0.000925925925925926) &
-                 + in(i+2,j-5) * (-0.000925925925925926) &
-                 + in(i+3,j-5) * (-0.000925925925925926) &
-                 + in(i+4,j-5) * (-0.000925925925925926) &
-                 + in(i+5,j-5) * (-0.000925925925925926) &
-                 + in(i+6,j-5) * (-0.000925925925925926) &
-                 + in(i-4,j-4) * (-0.010416666666666666) &
-                 + in(i+1,j-4) * (-0.001488095238095238) &
-                 + in(i+2,j-4) * (-0.001488095238095238) &
-                 + in(i+3,j-4) * (-0.001488095238095238) &
-                 + in(i+4,j-4) * (-0.001488095238095238) &
-                 + in(i+5,j-4) * (-0.001488095238095238) &
-                 + in(i+6,j-4) * (-0.001488095238095238) &
-                 + in(i-3,j-3) * (-0.013888888888888888) &
-                 + in(i+1,j-3) * (-0.002777777777777778) &
-                 + in(i+2,j-3) * (-0.002777777777777778) &
-                 + in(i+3,j-3) * (-0.002777777777777778) &
-                 + in(i+4,j-3) * (-0.002777777777777778) &
-                 + in(i+5,j-3) * (-0.002777777777777778) &
-                 + in(i+6,j-3) * (-0.002777777777777778) &
-                 + in(i-2,j-2) * (-0.020833333333333332) &
-                 + in(i+1,j-2) * (-0.006944444444444444) &
-                 + in(i+2,j-2) * (-0.006944444444444444) &
-                 + in(i+3,j-2) * (-0.006944444444444444) &
-                 + in(i+4,j-2) * (-0.006944444444444444) &
-                 + in(i+5,j-2) * (-0.006944444444444444) &
-                 + in(i+6,j-2) * (-0.006944444444444444) &
-                 + in(i-1,j-1) * (-0.041666666666666664) &
-                 + in(i+1,j-1) * (-0.041666666666666664) &
-                 + in(i+2,j-1) * (-0.041666666666666664) &
-                 + in(i+3,j-1) * (-0.041666666666666664) &
-                 + in(i+4,j-1) * (-0.041666666666666664) &
-                 + in(i+5,j-1) * (-0.041666666666666664) &
-                 + in(i+6,j-1) * (-0.041666666666666664) &
-                 + in(i-6,j+1) * (-0.0006313131313131314) &
-                 + in(i-5,j+1) * (-0.000925925925925926) &
-                 + in(i-4,j+1) * (-0.001488095238095238) &
-                 + in(i-3,j+1) * (-0.002777777777777778) &
-                 + in(i-2,j+1) * (-0.006944444444444444) &
-                 + in(i-1,j+1) * (-0.041666666666666664) &
-                 + in(i+1,j+1) * (0.041666666666666664) &
-                 + in(i+2,j+1) * (0.006944444444444444) &
-                 + in(i+3,j+1) * (0.002777777777777778) &
-                 + in(i+4,j+1) * (0.001488095238095238) &
-                 + in(i+5,j+1) * (0.000925925925925926) &
-                 + in(i+6,j+1) * (0.0006313131313131314) &
-                 + in(i-6,j+2) * (-0.0006313131313131314) &
-                 + in(i-5,j+2) * (-0.000925925925925926) &
-                 + in(i-4,j+2) * (-0.001488095238095238) &
-                 + in(i-3,j+2) * (-0.002777777777777778) &
-                 + in(i-2,j+2) * (-0.006944444444444444) &
-                 + in(i-1,j+2) * (-0.041666666666666664) &
-                 + in(i+1,j+2) * (0.006944444444444444) &
-                 + in(i+2,j+2) * (0.020833333333333332) &
-                 + in(i+3,j+2) * (0.002777777777777778) &
-                 + in(i+4,j+2) * (0.001488095238095238) &
-                 + in(i+5,j+2) * (0.000925925925925926) &
-                 + in(i+6,j+2) * (0.0006313131313131314) &
-                 + in(i-6,j+3) * (-0.0006313131313131314) &
-                 + in(i-5,j+3) * (-0.000925925925925926) &
-                 + in(i-4,j+3) * (-0.001488095238095238) &
-                 + in(i-3,j+3) * (-0.002777777777777778) &
-                 + in(i-2,j+3) * (-0.006944444444444444) &
-                 + in(i-1,j+3) * (-0.041666666666666664) &
-                 + in(i+1,j+3) * (0.002777777777777778) &
-                 + in(i+2,j+3) * (0.002777777777777778) &
-                 + in(i+3,j+3) * (0.013888888888888888) &
-                 + in(i+4,j+3) * (0.001488095238095238) &
-                 + in(i+5,j+3) * (0.000925925925925926) &
-                 + in(i+6,j+3) * (0.0006313131313131314) &
-                 + in(i-6,j+4) * (-0.0006313131313131314) &
-                 + in(i-5,j+4) * (-0.000925925925925926) &
-                 + in(i-4,j+4) * (-0.001488095238095238) &
-                 + in(i-3,j+4) * (-0.002777777777777778) &
-                 + in(i-2,j+4) * (-0.006944444444444444) &
-                 + in(i-1,j+4) * (-0.041666666666666664) &
-                 + in(i+1,j+4) * (0.001488095238095238) &
-                 + in(i+2,j+4) * (0.001488095238095238) &
-                 + in(i+3,j+4) * (0.001488095238095238) &
-                 + in(i+4,j+4) * (0.010416666666666666) &
-                 + in(i+5,j+4) * (0.000925925925925926) &
-                 + in(i+6,j+4) * (0.0006313131313131314) &
-                 + in(i-6,j+5) * (-0.0006313131313131314) &
-                 + in(i-5,j+5) * (-0.000925925925925926) &
-                 + in(i-4,j+5) * (-0.001488095238095238) &
-                 + in(i-3,j+5) * (-0.002777777777777778) &
-                 + in(i-2,j+5) * (-0.006944444444444444) &
-                 + in(i-1,j+5) * (-0.041666666666666664) &
-                 + in(i+1,j+5) * (0.000925925925925926) &
-                 + in(i+2,j+5) * (0.000925925925925926) &
-                 + in(i+3,j+5) * (0.000925925925925926) &
-                 + in(i+4,j+5) * (0.000925925925925926) &
-                 + in(i+5,j+5) * (0.008333333333333333) &
-                 + in(i+6,j+5) * (0.0006313131313131314) &
-                 + in(i-6,j+6) * (-0.0006313131313131314) &
-                 + in(i-5,j+6) * (-0.000925925925925926) &
-                 + in(i-4,j+6) * (-0.001488095238095238) &
-                 + in(i-3,j+6) * (-0.002777777777777778) &
-                 + in(i-2,j+6) * (-0.006944444444444444) &
-                 + in(i-1,j+6) * (-0.041666666666666664) &
-                 + in(i+1,j+6) * (0.0006313131313131314) &
-                 + in(i+2,j+6) * (0.0006313131313131314) &
-                 + in(i+3,j+6) * (0.0006313131313131314) &
-                 + in(i+4,j+6) * (0.0006313131313131314) &
-                 + in(i+5,j+6) * (0.0006313131313131314) &
-                 + in(i+6,j+6) * (0.006944444444444444) &
+                 + in(i-6,j-6) * (-0.006944444444444444d0) &
+                 + in(i+1,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+2,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+3,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+4,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+5,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+6,j-6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j-5) * (-0.008333333333333333d0) &
+                 + in(i+1,j-5) * (-0.000925925925925926d0) &
+                 + in(i+2,j-5) * (-0.000925925925925926d0) &
+                 + in(i+3,j-5) * (-0.000925925925925926d0) &
+                 + in(i+4,j-5) * (-0.000925925925925926d0) &
+                 + in(i+5,j-5) * (-0.000925925925925926d0) &
+                 + in(i+6,j-5) * (-0.000925925925925926d0) &
+                 + in(i-4,j-4) * (-0.010416666666666666d0) &
+                 + in(i+1,j-4) * (-0.001488095238095238d0) &
+                 + in(i+2,j-4) * (-0.001488095238095238d0) &
+                 + in(i+3,j-4) * (-0.001488095238095238d0) &
+                 + in(i+4,j-4) * (-0.001488095238095238d0) &
+                 + in(i+5,j-4) * (-0.001488095238095238d0) &
+                 + in(i+6,j-4) * (-0.001488095238095238d0) &
+                 + in(i-3,j-3) * (-0.013888888888888888d0) &
+                 + in(i+1,j-3) * (-0.002777777777777778d0) &
+                 + in(i+2,j-3) * (-0.002777777777777778d0) &
+                 + in(i+3,j-3) * (-0.002777777777777778d0) &
+                 + in(i+4,j-3) * (-0.002777777777777778d0) &
+                 + in(i+5,j-3) * (-0.002777777777777778d0) &
+                 + in(i+6,j-3) * (-0.002777777777777778d0) &
+                 + in(i-2,j-2) * (-0.020833333333333332d0) &
+                 + in(i+1,j-2) * (-0.006944444444444444d0) &
+                 + in(i+2,j-2) * (-0.006944444444444444d0) &
+                 + in(i+3,j-2) * (-0.006944444444444444d0) &
+                 + in(i+4,j-2) * (-0.006944444444444444d0) &
+                 + in(i+5,j-2) * (-0.006944444444444444d0) &
+                 + in(i+6,j-2) * (-0.006944444444444444d0) &
+                 + in(i-1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+2,j-1) * (-0.041666666666666664d0) &
+                 + in(i+3,j-1) * (-0.041666666666666664d0) &
+                 + in(i+4,j-1) * (-0.041666666666666664d0) &
+                 + in(i+5,j-1) * (-0.041666666666666664d0) &
+                 + in(i+6,j-1) * (-0.041666666666666664d0) &
+                 + in(i-6,j+1) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+1) * (-0.000925925925925926d0) &
+                 + in(i-4,j+1) * (-0.001488095238095238d0) &
+                 + in(i-3,j+1) * (-0.002777777777777778d0) &
+                 + in(i-2,j+1) * (-0.006944444444444444d0) &
+                 + in(i-1,j+1) * (-0.041666666666666664d0) &
+                 + in(i+1,j+1) * (0.041666666666666664d0) &
+                 + in(i+2,j+1) * (0.006944444444444444d0) &
+                 + in(i+3,j+1) * (0.002777777777777778d0) &
+                 + in(i+4,j+1) * (0.001488095238095238d0) &
+                 + in(i+5,j+1) * (0.000925925925925926d0) &
+                 + in(i+6,j+1) * (0.0006313131313131314d0) &
+                 + in(i-6,j+2) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+2) * (-0.000925925925925926d0) &
+                 + in(i-4,j+2) * (-0.001488095238095238d0) &
+                 + in(i-3,j+2) * (-0.002777777777777778d0) &
+                 + in(i-2,j+2) * (-0.006944444444444444d0) &
+                 + in(i-1,j+2) * (-0.041666666666666664d0) &
+                 + in(i+1,j+2) * (0.006944444444444444d0) &
+                 + in(i+2,j+2) * (0.020833333333333332d0) &
+                 + in(i+3,j+2) * (0.002777777777777778d0) &
+                 + in(i+4,j+2) * (0.001488095238095238d0) &
+                 + in(i+5,j+2) * (0.000925925925925926d0) &
+                 + in(i+6,j+2) * (0.0006313131313131314d0) &
+                 + in(i-6,j+3) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+3) * (-0.000925925925925926d0) &
+                 + in(i-4,j+3) * (-0.001488095238095238d0) &
+                 + in(i-3,j+3) * (-0.002777777777777778d0) &
+                 + in(i-2,j+3) * (-0.006944444444444444d0) &
+                 + in(i-1,j+3) * (-0.041666666666666664d0) &
+                 + in(i+1,j+3) * (0.002777777777777778d0) &
+                 + in(i+2,j+3) * (0.002777777777777778d0) &
+                 + in(i+3,j+3) * (0.013888888888888888d0) &
+                 + in(i+4,j+3) * (0.001488095238095238d0) &
+                 + in(i+5,j+3) * (0.000925925925925926d0) &
+                 + in(i+6,j+3) * (0.0006313131313131314d0) &
+                 + in(i-6,j+4) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+4) * (-0.000925925925925926d0) &
+                 + in(i-4,j+4) * (-0.001488095238095238d0) &
+                 + in(i-3,j+4) * (-0.002777777777777778d0) &
+                 + in(i-2,j+4) * (-0.006944444444444444d0) &
+                 + in(i-1,j+4) * (-0.041666666666666664d0) &
+                 + in(i+1,j+4) * (0.001488095238095238d0) &
+                 + in(i+2,j+4) * (0.001488095238095238d0) &
+                 + in(i+3,j+4) * (0.001488095238095238d0) &
+                 + in(i+4,j+4) * (0.010416666666666666d0) &
+                 + in(i+5,j+4) * (0.000925925925925926d0) &
+                 + in(i+6,j+4) * (0.0006313131313131314d0) &
+                 + in(i-6,j+5) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+5) * (-0.000925925925925926d0) &
+                 + in(i-4,j+5) * (-0.001488095238095238d0) &
+                 + in(i-3,j+5) * (-0.002777777777777778d0) &
+                 + in(i-2,j+5) * (-0.006944444444444444d0) &
+                 + in(i-1,j+5) * (-0.041666666666666664d0) &
+                 + in(i+1,j+5) * (0.000925925925925926d0) &
+                 + in(i+2,j+5) * (0.000925925925925926d0) &
+                 + in(i+3,j+5) * (0.000925925925925926d0) &
+                 + in(i+4,j+5) * (0.000925925925925926d0) &
+                 + in(i+5,j+5) * (0.008333333333333333d0) &
+                 + in(i+6,j+5) * (0.0006313131313131314d0) &
+                 + in(i-6,j+6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+6) * (-0.000925925925925926d0) &
+                 + in(i-4,j+6) * (-0.001488095238095238d0) &
+                 + in(i-3,j+6) * (-0.002777777777777778d0) &
+                 + in(i-2,j+6) * (-0.006944444444444444d0) &
+                 + in(i-1,j+6) * (-0.041666666666666664d0) &
+                 + in(i+1,j+6) * (0.0006313131313131314d0) &
+                 + in(i+2,j+6) * (0.0006313131313131314d0) &
+                 + in(i+3,j+6) * (0.0006313131313131314d0) &
+                 + in(i+4,j+6) * (0.0006313131313131314d0) &
+                 + in(i+5,j+6) * (0.0006313131313131314d0) &
+                 + in(i+6,j+6) * (0.006944444444444444d0) &
 +0.0
       end do
       !$omp end simd
@@ -780,165 +765,164 @@ subroutine grid7(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=7,n-7-1
-      !$omp simd
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i-7,j-7) * (-0.00510204081632653) &
-                 + in(i+1,j-7) * (-0.0003924646781789639) &
-                 + in(i+2,j-7) * (-0.0003924646781789639) &
-                 + in(i+3,j-7) * (-0.0003924646781789639) &
-                 + in(i+4,j-7) * (-0.0003924646781789639) &
-                 + in(i+5,j-7) * (-0.0003924646781789639) &
-                 + in(i+6,j-7) * (-0.0003924646781789639) &
-                 + in(i+7,j-7) * (-0.0003924646781789639) &
-                 + in(i-6,j-6) * (-0.005952380952380952) &
-                 + in(i+1,j-6) * (-0.0005411255411255411) &
-                 + in(i+2,j-6) * (-0.0005411255411255411) &
-                 + in(i+3,j-6) * (-0.0005411255411255411) &
-                 + in(i+4,j-6) * (-0.0005411255411255411) &
-                 + in(i+5,j-6) * (-0.0005411255411255411) &
-                 + in(i+6,j-6) * (-0.0005411255411255411) &
-                 + in(i+7,j-6) * (-0.0005411255411255411) &
-                 + in(i-5,j-5) * (-0.007142857142857143) &
-                 + in(i+1,j-5) * (-0.0007936507936507937) &
-                 + in(i+2,j-5) * (-0.0007936507936507937) &
-                 + in(i+3,j-5) * (-0.0007936507936507937) &
-                 + in(i+4,j-5) * (-0.0007936507936507937) &
-                 + in(i+5,j-5) * (-0.0007936507936507937) &
-                 + in(i+6,j-5) * (-0.0007936507936507937) &
-                 + in(i+7,j-5) * (-0.0007936507936507937) &
-                 + in(i-4,j-4) * (-0.008928571428571428) &
-                 + in(i+1,j-4) * (-0.0012755102040816326) &
-                 + in(i+2,j-4) * (-0.0012755102040816326) &
-                 + in(i+3,j-4) * (-0.0012755102040816326) &
-                 + in(i+4,j-4) * (-0.0012755102040816326) &
-                 + in(i+5,j-4) * (-0.0012755102040816326) &
-                 + in(i+6,j-4) * (-0.0012755102040816326) &
-                 + in(i+7,j-4) * (-0.0012755102040816326) &
-                 + in(i-3,j-3) * (-0.011904761904761904) &
-                 + in(i+1,j-3) * (-0.002380952380952381) &
-                 + in(i+2,j-3) * (-0.002380952380952381) &
-                 + in(i+3,j-3) * (-0.002380952380952381) &
-                 + in(i+4,j-3) * (-0.002380952380952381) &
-                 + in(i+5,j-3) * (-0.002380952380952381) &
-                 + in(i+6,j-3) * (-0.002380952380952381) &
-                 + in(i+7,j-3) * (-0.002380952380952381) &
-                 + in(i-2,j-2) * (-0.017857142857142856) &
-                 + in(i+1,j-2) * (-0.005952380952380952) &
-                 + in(i+2,j-2) * (-0.005952380952380952) &
-                 + in(i+3,j-2) * (-0.005952380952380952) &
-                 + in(i+4,j-2) * (-0.005952380952380952) &
-                 + in(i+5,j-2) * (-0.005952380952380952) &
-                 + in(i+6,j-2) * (-0.005952380952380952) &
-                 + in(i+7,j-2) * (-0.005952380952380952) &
-                 + in(i-1,j-1) * (-0.03571428571428571) &
-                 + in(i+1,j-1) * (-0.03571428571428571) &
-                 + in(i+2,j-1) * (-0.03571428571428571) &
-                 + in(i+3,j-1) * (-0.03571428571428571) &
-                 + in(i+4,j-1) * (-0.03571428571428571) &
-                 + in(i+5,j-1) * (-0.03571428571428571) &
-                 + in(i+6,j-1) * (-0.03571428571428571) &
-                 + in(i+7,j-1) * (-0.03571428571428571) &
-                 + in(i-7,j+1) * (-0.0003924646781789639) &
-                 + in(i-6,j+1) * (-0.0005411255411255411) &
-                 + in(i-5,j+1) * (-0.0007936507936507937) &
-                 + in(i-4,j+1) * (-0.0012755102040816326) &
-                 + in(i-3,j+1) * (-0.002380952380952381) &
-                 + in(i-2,j+1) * (-0.005952380952380952) &
-                 + in(i-1,j+1) * (-0.03571428571428571) &
-                 + in(i+1,j+1) * (0.03571428571428571) &
-                 + in(i+2,j+1) * (0.005952380952380952) &
-                 + in(i+3,j+1) * (0.002380952380952381) &
-                 + in(i+4,j+1) * (0.0012755102040816326) &
-                 + in(i+5,j+1) * (0.0007936507936507937) &
-                 + in(i+6,j+1) * (0.0005411255411255411) &
-                 + in(i+7,j+1) * (0.0003924646781789639) &
-                 + in(i-7,j+2) * (-0.0003924646781789639) &
-                 + in(i-6,j+2) * (-0.0005411255411255411) &
-                 + in(i-5,j+2) * (-0.0007936507936507937) &
-                 + in(i-4,j+2) * (-0.0012755102040816326) &
-                 + in(i-3,j+2) * (-0.002380952380952381) &
-                 + in(i-2,j+2) * (-0.005952380952380952) &
-                 + in(i-1,j+2) * (-0.03571428571428571) &
-                 + in(i+1,j+2) * (0.005952380952380952) &
-                 + in(i+2,j+2) * (0.017857142857142856) &
-                 + in(i+3,j+2) * (0.002380952380952381) &
-                 + in(i+4,j+2) * (0.0012755102040816326) &
-                 + in(i+5,j+2) * (0.0007936507936507937) &
-                 + in(i+6,j+2) * (0.0005411255411255411) &
-                 + in(i+7,j+2) * (0.0003924646781789639) &
-                 + in(i-7,j+3) * (-0.0003924646781789639) &
-                 + in(i-6,j+3) * (-0.0005411255411255411) &
-                 + in(i-5,j+3) * (-0.0007936507936507937) &
-                 + in(i-4,j+3) * (-0.0012755102040816326) &
-                 + in(i-3,j+3) * (-0.002380952380952381) &
-                 + in(i-2,j+3) * (-0.005952380952380952) &
-                 + in(i-1,j+3) * (-0.03571428571428571) &
-                 + in(i+1,j+3) * (0.002380952380952381) &
-                 + in(i+2,j+3) * (0.002380952380952381) &
-                 + in(i+3,j+3) * (0.011904761904761904) &
-                 + in(i+4,j+3) * (0.0012755102040816326) &
-                 + in(i+5,j+3) * (0.0007936507936507937) &
-                 + in(i+6,j+3) * (0.0005411255411255411) &
-                 + in(i+7,j+3) * (0.0003924646781789639) &
-                 + in(i-7,j+4) * (-0.0003924646781789639) &
-                 + in(i-6,j+4) * (-0.0005411255411255411) &
-                 + in(i-5,j+4) * (-0.0007936507936507937) &
-                 + in(i-4,j+4) * (-0.0012755102040816326) &
-                 + in(i-3,j+4) * (-0.002380952380952381) &
-                 + in(i-2,j+4) * (-0.005952380952380952) &
-                 + in(i-1,j+4) * (-0.03571428571428571) &
-                 + in(i+1,j+4) * (0.0012755102040816326) &
-                 + in(i+2,j+4) * (0.0012755102040816326) &
-                 + in(i+3,j+4) * (0.0012755102040816326) &
-                 + in(i+4,j+4) * (0.008928571428571428) &
-                 + in(i+5,j+4) * (0.0007936507936507937) &
-                 + in(i+6,j+4) * (0.0005411255411255411) &
-                 + in(i+7,j+4) * (0.0003924646781789639) &
-                 + in(i-7,j+5) * (-0.0003924646781789639) &
-                 + in(i-6,j+5) * (-0.0005411255411255411) &
-                 + in(i-5,j+5) * (-0.0007936507936507937) &
-                 + in(i-4,j+5) * (-0.0012755102040816326) &
-                 + in(i-3,j+5) * (-0.002380952380952381) &
-                 + in(i-2,j+5) * (-0.005952380952380952) &
-                 + in(i-1,j+5) * (-0.03571428571428571) &
-                 + in(i+1,j+5) * (0.0007936507936507937) &
-                 + in(i+2,j+5) * (0.0007936507936507937) &
-                 + in(i+3,j+5) * (0.0007936507936507937) &
-                 + in(i+4,j+5) * (0.0007936507936507937) &
-                 + in(i+5,j+5) * (0.007142857142857143) &
-                 + in(i+6,j+5) * (0.0005411255411255411) &
-                 + in(i+7,j+5) * (0.0003924646781789639) &
-                 + in(i-7,j+6) * (-0.0003924646781789639) &
-                 + in(i-6,j+6) * (-0.0005411255411255411) &
-                 + in(i-5,j+6) * (-0.0007936507936507937) &
-                 + in(i-4,j+6) * (-0.0012755102040816326) &
-                 + in(i-3,j+6) * (-0.002380952380952381) &
-                 + in(i-2,j+6) * (-0.005952380952380952) &
-                 + in(i-1,j+6) * (-0.03571428571428571) &
-                 + in(i+1,j+6) * (0.0005411255411255411) &
-                 + in(i+2,j+6) * (0.0005411255411255411) &
-                 + in(i+3,j+6) * (0.0005411255411255411) &
-                 + in(i+4,j+6) * (0.0005411255411255411) &
-                 + in(i+5,j+6) * (0.0005411255411255411) &
-                 + in(i+6,j+6) * (0.005952380952380952) &
-                 + in(i+7,j+6) * (0.0003924646781789639) &
-                 + in(i-7,j+7) * (-0.0003924646781789639) &
-                 + in(i-6,j+7) * (-0.0005411255411255411) &
-                 + in(i-5,j+7) * (-0.0007936507936507937) &
-                 + in(i-4,j+7) * (-0.0012755102040816326) &
-                 + in(i-3,j+7) * (-0.002380952380952381) &
-                 + in(i-2,j+7) * (-0.005952380952380952) &
-                 + in(i-1,j+7) * (-0.03571428571428571) &
-                 + in(i+1,j+7) * (0.0003924646781789639) &
-                 + in(i+2,j+7) * (0.0003924646781789639) &
-                 + in(i+3,j+7) * (0.0003924646781789639) &
-                 + in(i+4,j+7) * (0.0003924646781789639) &
-                 + in(i+5,j+7) * (0.0003924646781789639) &
-                 + in(i+6,j+7) * (0.0003924646781789639) &
-                 + in(i+7,j+7) * (0.00510204081632653) &
+                 + in(i-7,j-7) * (-0.00510204081632653d0) &
+                 + in(i+1,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+2,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+3,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+4,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+5,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+6,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+7,j-7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j-6) * (-0.005952380952380952d0) &
+                 + in(i+1,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+2,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+3,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+4,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+5,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+6,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+7,j-6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j-5) * (-0.007142857142857143d0) &
+                 + in(i+1,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+2,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+3,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+4,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+5,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+6,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+7,j-5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j-4) * (-0.008928571428571428d0) &
+                 + in(i+1,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+2,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+3,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+4,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+5,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+6,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+7,j-4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j-3) * (-0.011904761904761904d0) &
+                 + in(i+1,j-3) * (-0.002380952380952381d0) &
+                 + in(i+2,j-3) * (-0.002380952380952381d0) &
+                 + in(i+3,j-3) * (-0.002380952380952381d0) &
+                 + in(i+4,j-3) * (-0.002380952380952381d0) &
+                 + in(i+5,j-3) * (-0.002380952380952381d0) &
+                 + in(i+6,j-3) * (-0.002380952380952381d0) &
+                 + in(i+7,j-3) * (-0.002380952380952381d0) &
+                 + in(i-2,j-2) * (-0.017857142857142856d0) &
+                 + in(i+1,j-2) * (-0.005952380952380952d0) &
+                 + in(i+2,j-2) * (-0.005952380952380952d0) &
+                 + in(i+3,j-2) * (-0.005952380952380952d0) &
+                 + in(i+4,j-2) * (-0.005952380952380952d0) &
+                 + in(i+5,j-2) * (-0.005952380952380952d0) &
+                 + in(i+6,j-2) * (-0.005952380952380952d0) &
+                 + in(i+7,j-2) * (-0.005952380952380952d0) &
+                 + in(i-1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+2,j-1) * (-0.03571428571428571d0) &
+                 + in(i+3,j-1) * (-0.03571428571428571d0) &
+                 + in(i+4,j-1) * (-0.03571428571428571d0) &
+                 + in(i+5,j-1) * (-0.03571428571428571d0) &
+                 + in(i+6,j-1) * (-0.03571428571428571d0) &
+                 + in(i+7,j-1) * (-0.03571428571428571d0) &
+                 + in(i-7,j+1) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+1) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+1) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+1) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+1) * (-0.002380952380952381d0) &
+                 + in(i-2,j+1) * (-0.005952380952380952d0) &
+                 + in(i-1,j+1) * (-0.03571428571428571d0) &
+                 + in(i+1,j+1) * (0.03571428571428571d0) &
+                 + in(i+2,j+1) * (0.005952380952380952d0) &
+                 + in(i+3,j+1) * (0.002380952380952381d0) &
+                 + in(i+4,j+1) * (0.0012755102040816326d0) &
+                 + in(i+5,j+1) * (0.0007936507936507937d0) &
+                 + in(i+6,j+1) * (0.0005411255411255411d0) &
+                 + in(i+7,j+1) * (0.0003924646781789639d0) &
+                 + in(i-7,j+2) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+2) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+2) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+2) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+2) * (-0.002380952380952381d0) &
+                 + in(i-2,j+2) * (-0.005952380952380952d0) &
+                 + in(i-1,j+2) * (-0.03571428571428571d0) &
+                 + in(i+1,j+2) * (0.005952380952380952d0) &
+                 + in(i+2,j+2) * (0.017857142857142856d0) &
+                 + in(i+3,j+2) * (0.002380952380952381d0) &
+                 + in(i+4,j+2) * (0.0012755102040816326d0) &
+                 + in(i+5,j+2) * (0.0007936507936507937d0) &
+                 + in(i+6,j+2) * (0.0005411255411255411d0) &
+                 + in(i+7,j+2) * (0.0003924646781789639d0) &
+                 + in(i-7,j+3) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+3) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+3) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+3) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+3) * (-0.002380952380952381d0) &
+                 + in(i-2,j+3) * (-0.005952380952380952d0) &
+                 + in(i-1,j+3) * (-0.03571428571428571d0) &
+                 + in(i+1,j+3) * (0.002380952380952381d0) &
+                 + in(i+2,j+3) * (0.002380952380952381d0) &
+                 + in(i+3,j+3) * (0.011904761904761904d0) &
+                 + in(i+4,j+3) * (0.0012755102040816326d0) &
+                 + in(i+5,j+3) * (0.0007936507936507937d0) &
+                 + in(i+6,j+3) * (0.0005411255411255411d0) &
+                 + in(i+7,j+3) * (0.0003924646781789639d0) &
+                 + in(i-7,j+4) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+4) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+4) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+4) * (-0.002380952380952381d0) &
+                 + in(i-2,j+4) * (-0.005952380952380952d0) &
+                 + in(i-1,j+4) * (-0.03571428571428571d0) &
+                 + in(i+1,j+4) * (0.0012755102040816326d0) &
+                 + in(i+2,j+4) * (0.0012755102040816326d0) &
+                 + in(i+3,j+4) * (0.0012755102040816326d0) &
+                 + in(i+4,j+4) * (0.008928571428571428d0) &
+                 + in(i+5,j+4) * (0.0007936507936507937d0) &
+                 + in(i+6,j+4) * (0.0005411255411255411d0) &
+                 + in(i+7,j+4) * (0.0003924646781789639d0) &
+                 + in(i-7,j+5) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+5) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+5) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+5) * (-0.002380952380952381d0) &
+                 + in(i-2,j+5) * (-0.005952380952380952d0) &
+                 + in(i-1,j+5) * (-0.03571428571428571d0) &
+                 + in(i+1,j+5) * (0.0007936507936507937d0) &
+                 + in(i+2,j+5) * (0.0007936507936507937d0) &
+                 + in(i+3,j+5) * (0.0007936507936507937d0) &
+                 + in(i+4,j+5) * (0.0007936507936507937d0) &
+                 + in(i+5,j+5) * (0.007142857142857143d0) &
+                 + in(i+6,j+5) * (0.0005411255411255411d0) &
+                 + in(i+7,j+5) * (0.0003924646781789639d0) &
+                 + in(i-7,j+6) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+6) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+6) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+6) * (-0.002380952380952381d0) &
+                 + in(i-2,j+6) * (-0.005952380952380952d0) &
+                 + in(i-1,j+6) * (-0.03571428571428571d0) &
+                 + in(i+1,j+6) * (0.0005411255411255411d0) &
+                 + in(i+2,j+6) * (0.0005411255411255411d0) &
+                 + in(i+3,j+6) * (0.0005411255411255411d0) &
+                 + in(i+4,j+6) * (0.0005411255411255411d0) &
+                 + in(i+5,j+6) * (0.0005411255411255411d0) &
+                 + in(i+6,j+6) * (0.005952380952380952d0) &
+                 + in(i+7,j+6) * (0.0003924646781789639d0) &
+                 + in(i-7,j+7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+7) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+7) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+7) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+7) * (-0.002380952380952381d0) &
+                 + in(i-2,j+7) * (-0.005952380952380952d0) &
+                 + in(i-1,j+7) * (-0.03571428571428571d0) &
+                 + in(i+1,j+7) * (0.0003924646781789639d0) &
+                 + in(i+2,j+7) * (0.0003924646781789639d0) &
+                 + in(i+3,j+7) * (0.0003924646781789639d0) &
+                 + in(i+4,j+7) * (0.0003924646781789639d0) &
+                 + in(i+5,j+7) * (0.0003924646781789639d0) &
+                 + in(i+6,j+7) * (0.0003924646781789639d0) &
+                 + in(i+7,j+7) * (0.00510204081632653d0) &
 +0.0
       end do
       !$omp end simd
@@ -954,211 +938,210 @@ subroutine grid8(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=8,n-8-1
-      !$omp simd
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i-8,j-8) * (-0.00390625) &
-                 + in(i+1,j-8) * (-0.00026041666666666666) &
-                 + in(i+2,j-8) * (-0.00026041666666666666) &
-                 + in(i+3,j-8) * (-0.00026041666666666666) &
-                 + in(i+4,j-8) * (-0.00026041666666666666) &
-                 + in(i+5,j-8) * (-0.00026041666666666666) &
-                 + in(i+6,j-8) * (-0.00026041666666666666) &
-                 + in(i+7,j-8) * (-0.00026041666666666666) &
-                 + in(i+8,j-8) * (-0.00026041666666666666) &
-                 + in(i-7,j-7) * (-0.004464285714285714) &
-                 + in(i+1,j-7) * (-0.00034340659340659343) &
-                 + in(i+2,j-7) * (-0.00034340659340659343) &
-                 + in(i+3,j-7) * (-0.00034340659340659343) &
-                 + in(i+4,j-7) * (-0.00034340659340659343) &
-                 + in(i+5,j-7) * (-0.00034340659340659343) &
-                 + in(i+6,j-7) * (-0.00034340659340659343) &
-                 + in(i+7,j-7) * (-0.00034340659340659343) &
-                 + in(i+8,j-7) * (-0.00034340659340659343) &
-                 + in(i-6,j-6) * (-0.005208333333333333) &
-                 + in(i+1,j-6) * (-0.0004734848484848485) &
-                 + in(i+2,j-6) * (-0.0004734848484848485) &
-                 + in(i+3,j-6) * (-0.0004734848484848485) &
-                 + in(i+4,j-6) * (-0.0004734848484848485) &
-                 + in(i+5,j-6) * (-0.0004734848484848485) &
-                 + in(i+6,j-6) * (-0.0004734848484848485) &
-                 + in(i+7,j-6) * (-0.0004734848484848485) &
-                 + in(i+8,j-6) * (-0.0004734848484848485) &
-                 + in(i-5,j-5) * (-0.00625) &
-                 + in(i+1,j-5) * (-0.0006944444444444445) &
-                 + in(i+2,j-5) * (-0.0006944444444444445) &
-                 + in(i+3,j-5) * (-0.0006944444444444445) &
-                 + in(i+4,j-5) * (-0.0006944444444444445) &
-                 + in(i+5,j-5) * (-0.0006944444444444445) &
-                 + in(i+6,j-5) * (-0.0006944444444444445) &
-                 + in(i+7,j-5) * (-0.0006944444444444445) &
-                 + in(i+8,j-5) * (-0.0006944444444444445) &
-                 + in(i-4,j-4) * (-0.0078125) &
-                 + in(i+1,j-4) * (-0.0011160714285714285) &
-                 + in(i+2,j-4) * (-0.0011160714285714285) &
-                 + in(i+3,j-4) * (-0.0011160714285714285) &
-                 + in(i+4,j-4) * (-0.0011160714285714285) &
-                 + in(i+5,j-4) * (-0.0011160714285714285) &
-                 + in(i+6,j-4) * (-0.0011160714285714285) &
-                 + in(i+7,j-4) * (-0.0011160714285714285) &
-                 + in(i+8,j-4) * (-0.0011160714285714285) &
-                 + in(i-3,j-3) * (-0.010416666666666666) &
-                 + in(i+1,j-3) * (-0.0020833333333333333) &
-                 + in(i+2,j-3) * (-0.0020833333333333333) &
-                 + in(i+3,j-3) * (-0.0020833333333333333) &
-                 + in(i+4,j-3) * (-0.0020833333333333333) &
-                 + in(i+5,j-3) * (-0.0020833333333333333) &
-                 + in(i+6,j-3) * (-0.0020833333333333333) &
-                 + in(i+7,j-3) * (-0.0020833333333333333) &
-                 + in(i+8,j-3) * (-0.0020833333333333333) &
-                 + in(i-2,j-2) * (-0.015625) &
-                 + in(i+1,j-2) * (-0.005208333333333333) &
-                 + in(i+2,j-2) * (-0.005208333333333333) &
-                 + in(i+3,j-2) * (-0.005208333333333333) &
-                 + in(i+4,j-2) * (-0.005208333333333333) &
-                 + in(i+5,j-2) * (-0.005208333333333333) &
-                 + in(i+6,j-2) * (-0.005208333333333333) &
-                 + in(i+7,j-2) * (-0.005208333333333333) &
-                 + in(i+8,j-2) * (-0.005208333333333333) &
-                 + in(i-1,j-1) * (-0.03125) &
-                 + in(i+1,j-1) * (-0.03125) &
-                 + in(i+2,j-1) * (-0.03125) &
-                 + in(i+3,j-1) * (-0.03125) &
-                 + in(i+4,j-1) * (-0.03125) &
-                 + in(i+5,j-1) * (-0.03125) &
-                 + in(i+6,j-1) * (-0.03125) &
-                 + in(i+7,j-1) * (-0.03125) &
-                 + in(i+8,j-1) * (-0.03125) &
-                 + in(i-8,j+1) * (-0.00026041666666666666) &
-                 + in(i-7,j+1) * (-0.00034340659340659343) &
-                 + in(i-6,j+1) * (-0.0004734848484848485) &
-                 + in(i-5,j+1) * (-0.0006944444444444445) &
-                 + in(i-4,j+1) * (-0.0011160714285714285) &
-                 + in(i-3,j+1) * (-0.0020833333333333333) &
-                 + in(i-2,j+1) * (-0.005208333333333333) &
-                 + in(i-1,j+1) * (-0.03125) &
-                 + in(i+1,j+1) * (0.03125) &
-                 + in(i+2,j+1) * (0.005208333333333333) &
-                 + in(i+3,j+1) * (0.0020833333333333333) &
-                 + in(i+4,j+1) * (0.0011160714285714285) &
-                 + in(i+5,j+1) * (0.0006944444444444445) &
-                 + in(i+6,j+1) * (0.0004734848484848485) &
-                 + in(i+7,j+1) * (0.00034340659340659343) &
-                 + in(i+8,j+1) * (0.00026041666666666666) &
-                 + in(i-8,j+2) * (-0.00026041666666666666) &
-                 + in(i-7,j+2) * (-0.00034340659340659343) &
-                 + in(i-6,j+2) * (-0.0004734848484848485) &
-                 + in(i-5,j+2) * (-0.0006944444444444445) &
-                 + in(i-4,j+2) * (-0.0011160714285714285) &
-                 + in(i-3,j+2) * (-0.0020833333333333333) &
-                 + in(i-2,j+2) * (-0.005208333333333333) &
-                 + in(i-1,j+2) * (-0.03125) &
-                 + in(i+1,j+2) * (0.005208333333333333) &
-                 + in(i+2,j+2) * (0.015625) &
-                 + in(i+3,j+2) * (0.0020833333333333333) &
-                 + in(i+4,j+2) * (0.0011160714285714285) &
-                 + in(i+5,j+2) * (0.0006944444444444445) &
-                 + in(i+6,j+2) * (0.0004734848484848485) &
-                 + in(i+7,j+2) * (0.00034340659340659343) &
-                 + in(i+8,j+2) * (0.00026041666666666666) &
-                 + in(i-8,j+3) * (-0.00026041666666666666) &
-                 + in(i-7,j+3) * (-0.00034340659340659343) &
-                 + in(i-6,j+3) * (-0.0004734848484848485) &
-                 + in(i-5,j+3) * (-0.0006944444444444445) &
-                 + in(i-4,j+3) * (-0.0011160714285714285) &
-                 + in(i-3,j+3) * (-0.0020833333333333333) &
-                 + in(i-2,j+3) * (-0.005208333333333333) &
-                 + in(i-1,j+3) * (-0.03125) &
-                 + in(i+1,j+3) * (0.0020833333333333333) &
-                 + in(i+2,j+3) * (0.0020833333333333333) &
-                 + in(i+3,j+3) * (0.010416666666666666) &
-                 + in(i+4,j+3) * (0.0011160714285714285) &
-                 + in(i+5,j+3) * (0.0006944444444444445) &
-                 + in(i+6,j+3) * (0.0004734848484848485) &
-                 + in(i+7,j+3) * (0.00034340659340659343) &
-                 + in(i+8,j+3) * (0.00026041666666666666) &
-                 + in(i-8,j+4) * (-0.00026041666666666666) &
-                 + in(i-7,j+4) * (-0.00034340659340659343) &
-                 + in(i-6,j+4) * (-0.0004734848484848485) &
-                 + in(i-5,j+4) * (-0.0006944444444444445) &
-                 + in(i-4,j+4) * (-0.0011160714285714285) &
-                 + in(i-3,j+4) * (-0.0020833333333333333) &
-                 + in(i-2,j+4) * (-0.005208333333333333) &
-                 + in(i-1,j+4) * (-0.03125) &
-                 + in(i+1,j+4) * (0.0011160714285714285) &
-                 + in(i+2,j+4) * (0.0011160714285714285) &
-                 + in(i+3,j+4) * (0.0011160714285714285) &
-                 + in(i+4,j+4) * (0.0078125) &
-                 + in(i+5,j+4) * (0.0006944444444444445) &
-                 + in(i+6,j+4) * (0.0004734848484848485) &
-                 + in(i+7,j+4) * (0.00034340659340659343) &
-                 + in(i+8,j+4) * (0.00026041666666666666) &
-                 + in(i-8,j+5) * (-0.00026041666666666666) &
-                 + in(i-7,j+5) * (-0.00034340659340659343) &
-                 + in(i-6,j+5) * (-0.0004734848484848485) &
-                 + in(i-5,j+5) * (-0.0006944444444444445) &
-                 + in(i-4,j+5) * (-0.0011160714285714285) &
-                 + in(i-3,j+5) * (-0.0020833333333333333) &
-                 + in(i-2,j+5) * (-0.005208333333333333) &
-                 + in(i-1,j+5) * (-0.03125) &
-                 + in(i+1,j+5) * (0.0006944444444444445) &
-                 + in(i+2,j+5) * (0.0006944444444444445) &
-                 + in(i+3,j+5) * (0.0006944444444444445) &
-                 + in(i+4,j+5) * (0.0006944444444444445) &
-                 + in(i+5,j+5) * (0.00625) &
-                 + in(i+6,j+5) * (0.0004734848484848485) &
-                 + in(i+7,j+5) * (0.00034340659340659343) &
-                 + in(i+8,j+5) * (0.00026041666666666666) &
-                 + in(i-8,j+6) * (-0.00026041666666666666) &
-                 + in(i-7,j+6) * (-0.00034340659340659343) &
-                 + in(i-6,j+6) * (-0.0004734848484848485) &
-                 + in(i-5,j+6) * (-0.0006944444444444445) &
-                 + in(i-4,j+6) * (-0.0011160714285714285) &
-                 + in(i-3,j+6) * (-0.0020833333333333333) &
-                 + in(i-2,j+6) * (-0.005208333333333333) &
-                 + in(i-1,j+6) * (-0.03125) &
-                 + in(i+1,j+6) * (0.0004734848484848485) &
-                 + in(i+2,j+6) * (0.0004734848484848485) &
-                 + in(i+3,j+6) * (0.0004734848484848485) &
-                 + in(i+4,j+6) * (0.0004734848484848485) &
-                 + in(i+5,j+6) * (0.0004734848484848485) &
-                 + in(i+6,j+6) * (0.005208333333333333) &
-                 + in(i+7,j+6) * (0.00034340659340659343) &
-                 + in(i+8,j+6) * (0.00026041666666666666) &
-                 + in(i-8,j+7) * (-0.00026041666666666666) &
-                 + in(i-7,j+7) * (-0.00034340659340659343) &
-                 + in(i-6,j+7) * (-0.0004734848484848485) &
-                 + in(i-5,j+7) * (-0.0006944444444444445) &
-                 + in(i-4,j+7) * (-0.0011160714285714285) &
-                 + in(i-3,j+7) * (-0.0020833333333333333) &
-                 + in(i-2,j+7) * (-0.005208333333333333) &
-                 + in(i-1,j+7) * (-0.03125) &
-                 + in(i+1,j+7) * (0.00034340659340659343) &
-                 + in(i+2,j+7) * (0.00034340659340659343) &
-                 + in(i+3,j+7) * (0.00034340659340659343) &
-                 + in(i+4,j+7) * (0.00034340659340659343) &
-                 + in(i+5,j+7) * (0.00034340659340659343) &
-                 + in(i+6,j+7) * (0.00034340659340659343) &
-                 + in(i+7,j+7) * (0.004464285714285714) &
-                 + in(i+8,j+7) * (0.00026041666666666666) &
-                 + in(i-8,j+8) * (-0.00026041666666666666) &
-                 + in(i-7,j+8) * (-0.00034340659340659343) &
-                 + in(i-6,j+8) * (-0.0004734848484848485) &
-                 + in(i-5,j+8) * (-0.0006944444444444445) &
-                 + in(i-4,j+8) * (-0.0011160714285714285) &
-                 + in(i-3,j+8) * (-0.0020833333333333333) &
-                 + in(i-2,j+8) * (-0.005208333333333333) &
-                 + in(i-1,j+8) * (-0.03125) &
-                 + in(i+1,j+8) * (0.00026041666666666666) &
-                 + in(i+2,j+8) * (0.00026041666666666666) &
-                 + in(i+3,j+8) * (0.00026041666666666666) &
-                 + in(i+4,j+8) * (0.00026041666666666666) &
-                 + in(i+5,j+8) * (0.00026041666666666666) &
-                 + in(i+6,j+8) * (0.00026041666666666666) &
-                 + in(i+7,j+8) * (0.00026041666666666666) &
-                 + in(i+8,j+8) * (0.00390625) &
+                 + in(i-8,j-8) * (-0.00390625d0) &
+                 + in(i+1,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+2,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+3,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+4,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+5,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+6,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+7,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+8,j-8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j-7) * (-0.004464285714285714d0) &
+                 + in(i+1,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+2,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+3,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+4,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+5,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+6,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+7,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+8,j-7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j-6) * (-0.005208333333333333d0) &
+                 + in(i+1,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+2,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+3,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+4,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+5,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+6,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+7,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+8,j-6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j-5) * (-0.00625d0) &
+                 + in(i+1,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+2,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+3,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+4,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+5,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+6,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+7,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+8,j-5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j-4) * (-0.0078125d0) &
+                 + in(i+1,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+2,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+3,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+4,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+5,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+6,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+7,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+8,j-4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j-3) * (-0.010416666666666666d0) &
+                 + in(i+1,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+2,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+3,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+4,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+5,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+6,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+7,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+8,j-3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j-2) * (-0.015625d0) &
+                 + in(i+1,j-2) * (-0.005208333333333333d0) &
+                 + in(i+2,j-2) * (-0.005208333333333333d0) &
+                 + in(i+3,j-2) * (-0.005208333333333333d0) &
+                 + in(i+4,j-2) * (-0.005208333333333333d0) &
+                 + in(i+5,j-2) * (-0.005208333333333333d0) &
+                 + in(i+6,j-2) * (-0.005208333333333333d0) &
+                 + in(i+7,j-2) * (-0.005208333333333333d0) &
+                 + in(i+8,j-2) * (-0.005208333333333333d0) &
+                 + in(i-1,j-1) * (-0.03125d0) &
+                 + in(i+1,j-1) * (-0.03125d0) &
+                 + in(i+2,j-1) * (-0.03125d0) &
+                 + in(i+3,j-1) * (-0.03125d0) &
+                 + in(i+4,j-1) * (-0.03125d0) &
+                 + in(i+5,j-1) * (-0.03125d0) &
+                 + in(i+6,j-1) * (-0.03125d0) &
+                 + in(i+7,j-1) * (-0.03125d0) &
+                 + in(i+8,j-1) * (-0.03125d0) &
+                 + in(i-8,j+1) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+1) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+1) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+1) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+1) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+1) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+1) * (-0.005208333333333333d0) &
+                 + in(i-1,j+1) * (-0.03125d0) &
+                 + in(i+1,j+1) * (0.03125d0) &
+                 + in(i+2,j+1) * (0.005208333333333333d0) &
+                 + in(i+3,j+1) * (0.0020833333333333333d0) &
+                 + in(i+4,j+1) * (0.0011160714285714285d0) &
+                 + in(i+5,j+1) * (0.0006944444444444445d0) &
+                 + in(i+6,j+1) * (0.0004734848484848485d0) &
+                 + in(i+7,j+1) * (0.00034340659340659343d0) &
+                 + in(i+8,j+1) * (0.00026041666666666666d0) &
+                 + in(i-8,j+2) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+2) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+2) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+2) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+2) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+2) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+2) * (-0.005208333333333333d0) &
+                 + in(i-1,j+2) * (-0.03125d0) &
+                 + in(i+1,j+2) * (0.005208333333333333d0) &
+                 + in(i+2,j+2) * (0.015625d0) &
+                 + in(i+3,j+2) * (0.0020833333333333333d0) &
+                 + in(i+4,j+2) * (0.0011160714285714285d0) &
+                 + in(i+5,j+2) * (0.0006944444444444445d0) &
+                 + in(i+6,j+2) * (0.0004734848484848485d0) &
+                 + in(i+7,j+2) * (0.00034340659340659343d0) &
+                 + in(i+8,j+2) * (0.00026041666666666666d0) &
+                 + in(i-8,j+3) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+3) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+3) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+3) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+3) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+3) * (-0.005208333333333333d0) &
+                 + in(i-1,j+3) * (-0.03125d0) &
+                 + in(i+1,j+3) * (0.0020833333333333333d0) &
+                 + in(i+2,j+3) * (0.0020833333333333333d0) &
+                 + in(i+3,j+3) * (0.010416666666666666d0) &
+                 + in(i+4,j+3) * (0.0011160714285714285d0) &
+                 + in(i+5,j+3) * (0.0006944444444444445d0) &
+                 + in(i+6,j+3) * (0.0004734848484848485d0) &
+                 + in(i+7,j+3) * (0.00034340659340659343d0) &
+                 + in(i+8,j+3) * (0.00026041666666666666d0) &
+                 + in(i-8,j+4) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+4) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+4) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+4) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+4) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+4) * (-0.005208333333333333d0) &
+                 + in(i-1,j+4) * (-0.03125d0) &
+                 + in(i+1,j+4) * (0.0011160714285714285d0) &
+                 + in(i+2,j+4) * (0.0011160714285714285d0) &
+                 + in(i+3,j+4) * (0.0011160714285714285d0) &
+                 + in(i+4,j+4) * (0.0078125d0) &
+                 + in(i+5,j+4) * (0.0006944444444444445d0) &
+                 + in(i+6,j+4) * (0.0004734848484848485d0) &
+                 + in(i+7,j+4) * (0.00034340659340659343d0) &
+                 + in(i+8,j+4) * (0.00026041666666666666d0) &
+                 + in(i-8,j+5) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+5) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+5) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+5) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+5) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+5) * (-0.005208333333333333d0) &
+                 + in(i-1,j+5) * (-0.03125d0) &
+                 + in(i+1,j+5) * (0.0006944444444444445d0) &
+                 + in(i+2,j+5) * (0.0006944444444444445d0) &
+                 + in(i+3,j+5) * (0.0006944444444444445d0) &
+                 + in(i+4,j+5) * (0.0006944444444444445d0) &
+                 + in(i+5,j+5) * (0.00625d0) &
+                 + in(i+6,j+5) * (0.0004734848484848485d0) &
+                 + in(i+7,j+5) * (0.00034340659340659343d0) &
+                 + in(i+8,j+5) * (0.00026041666666666666d0) &
+                 + in(i-8,j+6) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+6) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+6) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+6) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+6) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+6) * (-0.005208333333333333d0) &
+                 + in(i-1,j+6) * (-0.03125d0) &
+                 + in(i+1,j+6) * (0.0004734848484848485d0) &
+                 + in(i+2,j+6) * (0.0004734848484848485d0) &
+                 + in(i+3,j+6) * (0.0004734848484848485d0) &
+                 + in(i+4,j+6) * (0.0004734848484848485d0) &
+                 + in(i+5,j+6) * (0.0004734848484848485d0) &
+                 + in(i+6,j+6) * (0.005208333333333333d0) &
+                 + in(i+7,j+6) * (0.00034340659340659343d0) &
+                 + in(i+8,j+6) * (0.00026041666666666666d0) &
+                 + in(i-8,j+7) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+7) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+7) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+7) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+7) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+7) * (-0.005208333333333333d0) &
+                 + in(i-1,j+7) * (-0.03125d0) &
+                 + in(i+1,j+7) * (0.00034340659340659343d0) &
+                 + in(i+2,j+7) * (0.00034340659340659343d0) &
+                 + in(i+3,j+7) * (0.00034340659340659343d0) &
+                 + in(i+4,j+7) * (0.00034340659340659343d0) &
+                 + in(i+5,j+7) * (0.00034340659340659343d0) &
+                 + in(i+6,j+7) * (0.00034340659340659343d0) &
+                 + in(i+7,j+7) * (0.004464285714285714d0) &
+                 + in(i+8,j+7) * (0.00026041666666666666d0) &
+                 + in(i-8,j+8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+8) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+8) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+8) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+8) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+8) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+8) * (-0.005208333333333333d0) &
+                 + in(i-1,j+8) * (-0.03125d0) &
+                 + in(i+1,j+8) * (0.00026041666666666666d0) &
+                 + in(i+2,j+8) * (0.00026041666666666666d0) &
+                 + in(i+3,j+8) * (0.00026041666666666666d0) &
+                 + in(i+4,j+8) * (0.00026041666666666666d0) &
+                 + in(i+5,j+8) * (0.00026041666666666666d0) &
+                 + in(i+6,j+8) * (0.00026041666666666666d0) &
+                 + in(i+7,j+8) * (0.00026041666666666666d0) &
+                 + in(i+8,j+8) * (0.00390625d0) &
 +0.0
       end do
       !$omp end simd
@@ -1174,263 +1157,262 @@ subroutine grid9(n, in, out)
 real(kind=REAL64), intent(in) :: in(n,n)
 real(kind=REAL64), intent(inout) :: out(n,n)
 integer(kind=INT32) :: i,j
-    !$omp do
+    !$omp teams distribute parallel for simd collapse(2) schedule(static,1)
     do i=9,n-9-1
-      !$omp simd
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i-9,j-9) * (-0.0030864197530864196) &
-                 + in(i+1,j-9) * (-0.00018155410312273057) &
-                 + in(i+2,j-9) * (-0.00018155410312273057) &
-                 + in(i+3,j-9) * (-0.00018155410312273057) &
-                 + in(i+4,j-9) * (-0.00018155410312273057) &
-                 + in(i+5,j-9) * (-0.00018155410312273057) &
-                 + in(i+6,j-9) * (-0.00018155410312273057) &
-                 + in(i+7,j-9) * (-0.00018155410312273057) &
-                 + in(i+8,j-9) * (-0.00018155410312273057) &
-                 + in(i+9,j-9) * (-0.00018155410312273057) &
-                 + in(i-8,j-8) * (-0.003472222222222222) &
-                 + in(i+1,j-8) * (-0.0002314814814814815) &
-                 + in(i+2,j-8) * (-0.0002314814814814815) &
-                 + in(i+3,j-8) * (-0.0002314814814814815) &
-                 + in(i+4,j-8) * (-0.0002314814814814815) &
-                 + in(i+5,j-8) * (-0.0002314814814814815) &
-                 + in(i+6,j-8) * (-0.0002314814814814815) &
-                 + in(i+7,j-8) * (-0.0002314814814814815) &
-                 + in(i+8,j-8) * (-0.0002314814814814815) &
-                 + in(i+9,j-8) * (-0.0002314814814814815) &
-                 + in(i-7,j-7) * (-0.003968253968253968) &
-                 + in(i+1,j-7) * (-0.00030525030525030525) &
-                 + in(i+2,j-7) * (-0.00030525030525030525) &
-                 + in(i+3,j-7) * (-0.00030525030525030525) &
-                 + in(i+4,j-7) * (-0.00030525030525030525) &
-                 + in(i+5,j-7) * (-0.00030525030525030525) &
-                 + in(i+6,j-7) * (-0.00030525030525030525) &
-                 + in(i+7,j-7) * (-0.00030525030525030525) &
-                 + in(i+8,j-7) * (-0.00030525030525030525) &
-                 + in(i+9,j-7) * (-0.00030525030525030525) &
-                 + in(i-6,j-6) * (-0.004629629629629629) &
-                 + in(i+1,j-6) * (-0.00042087542087542086) &
-                 + in(i+2,j-6) * (-0.00042087542087542086) &
-                 + in(i+3,j-6) * (-0.00042087542087542086) &
-                 + in(i+4,j-6) * (-0.00042087542087542086) &
-                 + in(i+5,j-6) * (-0.00042087542087542086) &
-                 + in(i+6,j-6) * (-0.00042087542087542086) &
-                 + in(i+7,j-6) * (-0.00042087542087542086) &
-                 + in(i+8,j-6) * (-0.00042087542087542086) &
-                 + in(i+9,j-6) * (-0.00042087542087542086) &
-                 + in(i-5,j-5) * (-0.005555555555555556) &
-                 + in(i+1,j-5) * (-0.0006172839506172839) &
-                 + in(i+2,j-5) * (-0.0006172839506172839) &
-                 + in(i+3,j-5) * (-0.0006172839506172839) &
-                 + in(i+4,j-5) * (-0.0006172839506172839) &
-                 + in(i+5,j-5) * (-0.0006172839506172839) &
-                 + in(i+6,j-5) * (-0.0006172839506172839) &
-                 + in(i+7,j-5) * (-0.0006172839506172839) &
-                 + in(i+8,j-5) * (-0.0006172839506172839) &
-                 + in(i+9,j-5) * (-0.0006172839506172839) &
-                 + in(i-4,j-4) * (-0.006944444444444444) &
-                 + in(i+1,j-4) * (-0.000992063492063492) &
-                 + in(i+2,j-4) * (-0.000992063492063492) &
-                 + in(i+3,j-4) * (-0.000992063492063492) &
-                 + in(i+4,j-4) * (-0.000992063492063492) &
-                 + in(i+5,j-4) * (-0.000992063492063492) &
-                 + in(i+6,j-4) * (-0.000992063492063492) &
-                 + in(i+7,j-4) * (-0.000992063492063492) &
-                 + in(i+8,j-4) * (-0.000992063492063492) &
-                 + in(i+9,j-4) * (-0.000992063492063492) &
-                 + in(i-3,j-3) * (-0.009259259259259259) &
-                 + in(i+1,j-3) * (-0.001851851851851852) &
-                 + in(i+2,j-3) * (-0.001851851851851852) &
-                 + in(i+3,j-3) * (-0.001851851851851852) &
-                 + in(i+4,j-3) * (-0.001851851851851852) &
-                 + in(i+5,j-3) * (-0.001851851851851852) &
-                 + in(i+6,j-3) * (-0.001851851851851852) &
-                 + in(i+7,j-3) * (-0.001851851851851852) &
-                 + in(i+8,j-3) * (-0.001851851851851852) &
-                 + in(i+9,j-3) * (-0.001851851851851852) &
-                 + in(i-2,j-2) * (-0.013888888888888888) &
-                 + in(i+1,j-2) * (-0.004629629629629629) &
-                 + in(i+2,j-2) * (-0.004629629629629629) &
-                 + in(i+3,j-2) * (-0.004629629629629629) &
-                 + in(i+4,j-2) * (-0.004629629629629629) &
-                 + in(i+5,j-2) * (-0.004629629629629629) &
-                 + in(i+6,j-2) * (-0.004629629629629629) &
-                 + in(i+7,j-2) * (-0.004629629629629629) &
-                 + in(i+8,j-2) * (-0.004629629629629629) &
-                 + in(i+9,j-2) * (-0.004629629629629629) &
-                 + in(i-1,j-1) * (-0.027777777777777776) &
-                 + in(i+1,j-1) * (-0.027777777777777776) &
-                 + in(i+2,j-1) * (-0.027777777777777776) &
-                 + in(i+3,j-1) * (-0.027777777777777776) &
-                 + in(i+4,j-1) * (-0.027777777777777776) &
-                 + in(i+5,j-1) * (-0.027777777777777776) &
-                 + in(i+6,j-1) * (-0.027777777777777776) &
-                 + in(i+7,j-1) * (-0.027777777777777776) &
-                 + in(i+8,j-1) * (-0.027777777777777776) &
-                 + in(i+9,j-1) * (-0.027777777777777776) &
-                 + in(i-9,j+1) * (-0.00018155410312273057) &
-                 + in(i-8,j+1) * (-0.0002314814814814815) &
-                 + in(i-7,j+1) * (-0.00030525030525030525) &
-                 + in(i-6,j+1) * (-0.00042087542087542086) &
-                 + in(i-5,j+1) * (-0.0006172839506172839) &
-                 + in(i-4,j+1) * (-0.000992063492063492) &
-                 + in(i-3,j+1) * (-0.001851851851851852) &
-                 + in(i-2,j+1) * (-0.004629629629629629) &
-                 + in(i-1,j+1) * (-0.027777777777777776) &
-                 + in(i+1,j+1) * (0.027777777777777776) &
-                 + in(i+2,j+1) * (0.004629629629629629) &
-                 + in(i+3,j+1) * (0.001851851851851852) &
-                 + in(i+4,j+1) * (0.000992063492063492) &
-                 + in(i+5,j+1) * (0.0006172839506172839) &
-                 + in(i+6,j+1) * (0.00042087542087542086) &
-                 + in(i+7,j+1) * (0.00030525030525030525) &
-                 + in(i+8,j+1) * (0.0002314814814814815) &
-                 + in(i+9,j+1) * (0.00018155410312273057) &
-                 + in(i-9,j+2) * (-0.00018155410312273057) &
-                 + in(i-8,j+2) * (-0.0002314814814814815) &
-                 + in(i-7,j+2) * (-0.00030525030525030525) &
-                 + in(i-6,j+2) * (-0.00042087542087542086) &
-                 + in(i-5,j+2) * (-0.0006172839506172839) &
-                 + in(i-4,j+2) * (-0.000992063492063492) &
-                 + in(i-3,j+2) * (-0.001851851851851852) &
-                 + in(i-2,j+2) * (-0.004629629629629629) &
-                 + in(i-1,j+2) * (-0.027777777777777776) &
-                 + in(i+1,j+2) * (0.004629629629629629) &
-                 + in(i+2,j+2) * (0.013888888888888888) &
-                 + in(i+3,j+2) * (0.001851851851851852) &
-                 + in(i+4,j+2) * (0.000992063492063492) &
-                 + in(i+5,j+2) * (0.0006172839506172839) &
-                 + in(i+6,j+2) * (0.00042087542087542086) &
-                 + in(i+7,j+2) * (0.00030525030525030525) &
-                 + in(i+8,j+2) * (0.0002314814814814815) &
-                 + in(i+9,j+2) * (0.00018155410312273057) &
-                 + in(i-9,j+3) * (-0.00018155410312273057) &
-                 + in(i-8,j+3) * (-0.0002314814814814815) &
-                 + in(i-7,j+3) * (-0.00030525030525030525) &
-                 + in(i-6,j+3) * (-0.00042087542087542086) &
-                 + in(i-5,j+3) * (-0.0006172839506172839) &
-                 + in(i-4,j+3) * (-0.000992063492063492) &
-                 + in(i-3,j+3) * (-0.001851851851851852) &
-                 + in(i-2,j+3) * (-0.004629629629629629) &
-                 + in(i-1,j+3) * (-0.027777777777777776) &
-                 + in(i+1,j+3) * (0.001851851851851852) &
-                 + in(i+2,j+3) * (0.001851851851851852) &
-                 + in(i+3,j+3) * (0.009259259259259259) &
-                 + in(i+4,j+3) * (0.000992063492063492) &
-                 + in(i+5,j+3) * (0.0006172839506172839) &
-                 + in(i+6,j+3) * (0.00042087542087542086) &
-                 + in(i+7,j+3) * (0.00030525030525030525) &
-                 + in(i+8,j+3) * (0.0002314814814814815) &
-                 + in(i+9,j+3) * (0.00018155410312273057) &
-                 + in(i-9,j+4) * (-0.00018155410312273057) &
-                 + in(i-8,j+4) * (-0.0002314814814814815) &
-                 + in(i-7,j+4) * (-0.00030525030525030525) &
-                 + in(i-6,j+4) * (-0.00042087542087542086) &
-                 + in(i-5,j+4) * (-0.0006172839506172839) &
-                 + in(i-4,j+4) * (-0.000992063492063492) &
-                 + in(i-3,j+4) * (-0.001851851851851852) &
-                 + in(i-2,j+4) * (-0.004629629629629629) &
-                 + in(i-1,j+4) * (-0.027777777777777776) &
-                 + in(i+1,j+4) * (0.000992063492063492) &
-                 + in(i+2,j+4) * (0.000992063492063492) &
-                 + in(i+3,j+4) * (0.000992063492063492) &
-                 + in(i+4,j+4) * (0.006944444444444444) &
-                 + in(i+5,j+4) * (0.0006172839506172839) &
-                 + in(i+6,j+4) * (0.00042087542087542086) &
-                 + in(i+7,j+4) * (0.00030525030525030525) &
-                 + in(i+8,j+4) * (0.0002314814814814815) &
-                 + in(i+9,j+4) * (0.00018155410312273057) &
-                 + in(i-9,j+5) * (-0.00018155410312273057) &
-                 + in(i-8,j+5) * (-0.0002314814814814815) &
-                 + in(i-7,j+5) * (-0.00030525030525030525) &
-                 + in(i-6,j+5) * (-0.00042087542087542086) &
-                 + in(i-5,j+5) * (-0.0006172839506172839) &
-                 + in(i-4,j+5) * (-0.000992063492063492) &
-                 + in(i-3,j+5) * (-0.001851851851851852) &
-                 + in(i-2,j+5) * (-0.004629629629629629) &
-                 + in(i-1,j+5) * (-0.027777777777777776) &
-                 + in(i+1,j+5) * (0.0006172839506172839) &
-                 + in(i+2,j+5) * (0.0006172839506172839) &
-                 + in(i+3,j+5) * (0.0006172839506172839) &
-                 + in(i+4,j+5) * (0.0006172839506172839) &
-                 + in(i+5,j+5) * (0.005555555555555556) &
-                 + in(i+6,j+5) * (0.00042087542087542086) &
-                 + in(i+7,j+5) * (0.00030525030525030525) &
-                 + in(i+8,j+5) * (0.0002314814814814815) &
-                 + in(i+9,j+5) * (0.00018155410312273057) &
-                 + in(i-9,j+6) * (-0.00018155410312273057) &
-                 + in(i-8,j+6) * (-0.0002314814814814815) &
-                 + in(i-7,j+6) * (-0.00030525030525030525) &
-                 + in(i-6,j+6) * (-0.00042087542087542086) &
-                 + in(i-5,j+6) * (-0.0006172839506172839) &
-                 + in(i-4,j+6) * (-0.000992063492063492) &
-                 + in(i-3,j+6) * (-0.001851851851851852) &
-                 + in(i-2,j+6) * (-0.004629629629629629) &
-                 + in(i-1,j+6) * (-0.027777777777777776) &
-                 + in(i+1,j+6) * (0.00042087542087542086) &
-                 + in(i+2,j+6) * (0.00042087542087542086) &
-                 + in(i+3,j+6) * (0.00042087542087542086) &
-                 + in(i+4,j+6) * (0.00042087542087542086) &
-                 + in(i+5,j+6) * (0.00042087542087542086) &
-                 + in(i+6,j+6) * (0.004629629629629629) &
-                 + in(i+7,j+6) * (0.00030525030525030525) &
-                 + in(i+8,j+6) * (0.0002314814814814815) &
-                 + in(i+9,j+6) * (0.00018155410312273057) &
-                 + in(i-9,j+7) * (-0.00018155410312273057) &
-                 + in(i-8,j+7) * (-0.0002314814814814815) &
-                 + in(i-7,j+7) * (-0.00030525030525030525) &
-                 + in(i-6,j+7) * (-0.00042087542087542086) &
-                 + in(i-5,j+7) * (-0.0006172839506172839) &
-                 + in(i-4,j+7) * (-0.000992063492063492) &
-                 + in(i-3,j+7) * (-0.001851851851851852) &
-                 + in(i-2,j+7) * (-0.004629629629629629) &
-                 + in(i-1,j+7) * (-0.027777777777777776) &
-                 + in(i+1,j+7) * (0.00030525030525030525) &
-                 + in(i+2,j+7) * (0.00030525030525030525) &
-                 + in(i+3,j+7) * (0.00030525030525030525) &
-                 + in(i+4,j+7) * (0.00030525030525030525) &
-                 + in(i+5,j+7) * (0.00030525030525030525) &
-                 + in(i+6,j+7) * (0.00030525030525030525) &
-                 + in(i+7,j+7) * (0.003968253968253968) &
-                 + in(i+8,j+7) * (0.0002314814814814815) &
-                 + in(i+9,j+7) * (0.00018155410312273057) &
-                 + in(i-9,j+8) * (-0.00018155410312273057) &
-                 + in(i-8,j+8) * (-0.0002314814814814815) &
-                 + in(i-7,j+8) * (-0.00030525030525030525) &
-                 + in(i-6,j+8) * (-0.00042087542087542086) &
-                 + in(i-5,j+8) * (-0.0006172839506172839) &
-                 + in(i-4,j+8) * (-0.000992063492063492) &
-                 + in(i-3,j+8) * (-0.001851851851851852) &
-                 + in(i-2,j+8) * (-0.004629629629629629) &
-                 + in(i-1,j+8) * (-0.027777777777777776) &
-                 + in(i+1,j+8) * (0.0002314814814814815) &
-                 + in(i+2,j+8) * (0.0002314814814814815) &
-                 + in(i+3,j+8) * (0.0002314814814814815) &
-                 + in(i+4,j+8) * (0.0002314814814814815) &
-                 + in(i+5,j+8) * (0.0002314814814814815) &
-                 + in(i+6,j+8) * (0.0002314814814814815) &
-                 + in(i+7,j+8) * (0.0002314814814814815) &
-                 + in(i+8,j+8) * (0.003472222222222222) &
-                 + in(i+9,j+8) * (0.00018155410312273057) &
-                 + in(i-9,j+9) * (-0.00018155410312273057) &
-                 + in(i-8,j+9) * (-0.0002314814814814815) &
-                 + in(i-7,j+9) * (-0.00030525030525030525) &
-                 + in(i-6,j+9) * (-0.00042087542087542086) &
-                 + in(i-5,j+9) * (-0.0006172839506172839) &
-                 + in(i-4,j+9) * (-0.000992063492063492) &
-                 + in(i-3,j+9) * (-0.001851851851851852) &
-                 + in(i-2,j+9) * (-0.004629629629629629) &
-                 + in(i-1,j+9) * (-0.027777777777777776) &
-                 + in(i+1,j+9) * (0.00018155410312273057) &
-                 + in(i+2,j+9) * (0.00018155410312273057) &
-                 + in(i+3,j+9) * (0.00018155410312273057) &
-                 + in(i+4,j+9) * (0.00018155410312273057) &
-                 + in(i+5,j+9) * (0.00018155410312273057) &
-                 + in(i+6,j+9) * (0.00018155410312273057) &
-                 + in(i+7,j+9) * (0.00018155410312273057) &
-                 + in(i+8,j+9) * (0.00018155410312273057) &
-                 + in(i+9,j+9) * (0.0030864197530864196) &
+                 + in(i-9,j-9) * (-0.0030864197530864196d0) &
+                 + in(i+1,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+2,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+3,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+4,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+5,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+6,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+7,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+8,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+9,j-9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j-8) * (-0.003472222222222222d0) &
+                 + in(i+1,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+2,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+3,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+4,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+5,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+6,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+7,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+8,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+9,j-8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j-7) * (-0.003968253968253968d0) &
+                 + in(i+1,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+2,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+3,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+4,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+5,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+6,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+7,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+8,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+9,j-7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j-6) * (-0.004629629629629629d0) &
+                 + in(i+1,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+2,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+3,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+4,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+5,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+6,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+7,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+8,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+9,j-6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j-5) * (-0.005555555555555556d0) &
+                 + in(i+1,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+2,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+3,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+4,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+5,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+6,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+7,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+8,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+9,j-5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j-4) * (-0.006944444444444444d0) &
+                 + in(i+1,j-4) * (-0.000992063492063492d0) &
+                 + in(i+2,j-4) * (-0.000992063492063492d0) &
+                 + in(i+3,j-4) * (-0.000992063492063492d0) &
+                 + in(i+4,j-4) * (-0.000992063492063492d0) &
+                 + in(i+5,j-4) * (-0.000992063492063492d0) &
+                 + in(i+6,j-4) * (-0.000992063492063492d0) &
+                 + in(i+7,j-4) * (-0.000992063492063492d0) &
+                 + in(i+8,j-4) * (-0.000992063492063492d0) &
+                 + in(i+9,j-4) * (-0.000992063492063492d0) &
+                 + in(i-3,j-3) * (-0.009259259259259259d0) &
+                 + in(i+1,j-3) * (-0.001851851851851852d0) &
+                 + in(i+2,j-3) * (-0.001851851851851852d0) &
+                 + in(i+3,j-3) * (-0.001851851851851852d0) &
+                 + in(i+4,j-3) * (-0.001851851851851852d0) &
+                 + in(i+5,j-3) * (-0.001851851851851852d0) &
+                 + in(i+6,j-3) * (-0.001851851851851852d0) &
+                 + in(i+7,j-3) * (-0.001851851851851852d0) &
+                 + in(i+8,j-3) * (-0.001851851851851852d0) &
+                 + in(i+9,j-3) * (-0.001851851851851852d0) &
+                 + in(i-2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+1,j-2) * (-0.004629629629629629d0) &
+                 + in(i+2,j-2) * (-0.004629629629629629d0) &
+                 + in(i+3,j-2) * (-0.004629629629629629d0) &
+                 + in(i+4,j-2) * (-0.004629629629629629d0) &
+                 + in(i+5,j-2) * (-0.004629629629629629d0) &
+                 + in(i+6,j-2) * (-0.004629629629629629d0) &
+                 + in(i+7,j-2) * (-0.004629629629629629d0) &
+                 + in(i+8,j-2) * (-0.004629629629629629d0) &
+                 + in(i+9,j-2) * (-0.004629629629629629d0) &
+                 + in(i-1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+2,j-1) * (-0.027777777777777776d0) &
+                 + in(i+3,j-1) * (-0.027777777777777776d0) &
+                 + in(i+4,j-1) * (-0.027777777777777776d0) &
+                 + in(i+5,j-1) * (-0.027777777777777776d0) &
+                 + in(i+6,j-1) * (-0.027777777777777776d0) &
+                 + in(i+7,j-1) * (-0.027777777777777776d0) &
+                 + in(i+8,j-1) * (-0.027777777777777776d0) &
+                 + in(i+9,j-1) * (-0.027777777777777776d0) &
+                 + in(i-9,j+1) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+1) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+1) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+1) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+1) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+1) * (-0.000992063492063492d0) &
+                 + in(i-3,j+1) * (-0.001851851851851852d0) &
+                 + in(i-2,j+1) * (-0.004629629629629629d0) &
+                 + in(i-1,j+1) * (-0.027777777777777776d0) &
+                 + in(i+1,j+1) * (0.027777777777777776d0) &
+                 + in(i+2,j+1) * (0.004629629629629629d0) &
+                 + in(i+3,j+1) * (0.001851851851851852d0) &
+                 + in(i+4,j+1) * (0.000992063492063492d0) &
+                 + in(i+5,j+1) * (0.0006172839506172839d0) &
+                 + in(i+6,j+1) * (0.00042087542087542086d0) &
+                 + in(i+7,j+1) * (0.00030525030525030525d0) &
+                 + in(i+8,j+1) * (0.0002314814814814815d0) &
+                 + in(i+9,j+1) * (0.00018155410312273057d0) &
+                 + in(i-9,j+2) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+2) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+2) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+2) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+2) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+2) * (-0.000992063492063492d0) &
+                 + in(i-3,j+2) * (-0.001851851851851852d0) &
+                 + in(i-2,j+2) * (-0.004629629629629629d0) &
+                 + in(i-1,j+2) * (-0.027777777777777776d0) &
+                 + in(i+1,j+2) * (0.004629629629629629d0) &
+                 + in(i+2,j+2) * (0.013888888888888888d0) &
+                 + in(i+3,j+2) * (0.001851851851851852d0) &
+                 + in(i+4,j+2) * (0.000992063492063492d0) &
+                 + in(i+5,j+2) * (0.0006172839506172839d0) &
+                 + in(i+6,j+2) * (0.00042087542087542086d0) &
+                 + in(i+7,j+2) * (0.00030525030525030525d0) &
+                 + in(i+8,j+2) * (0.0002314814814814815d0) &
+                 + in(i+9,j+2) * (0.00018155410312273057d0) &
+                 + in(i-9,j+3) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+3) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+3) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+3) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+3) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+3) * (-0.000992063492063492d0) &
+                 + in(i-3,j+3) * (-0.001851851851851852d0) &
+                 + in(i-2,j+3) * (-0.004629629629629629d0) &
+                 + in(i-1,j+3) * (-0.027777777777777776d0) &
+                 + in(i+1,j+3) * (0.001851851851851852d0) &
+                 + in(i+2,j+3) * (0.001851851851851852d0) &
+                 + in(i+3,j+3) * (0.009259259259259259d0) &
+                 + in(i+4,j+3) * (0.000992063492063492d0) &
+                 + in(i+5,j+3) * (0.0006172839506172839d0) &
+                 + in(i+6,j+3) * (0.00042087542087542086d0) &
+                 + in(i+7,j+3) * (0.00030525030525030525d0) &
+                 + in(i+8,j+3) * (0.0002314814814814815d0) &
+                 + in(i+9,j+3) * (0.00018155410312273057d0) &
+                 + in(i-9,j+4) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+4) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+4) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+4) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+4) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+4) * (-0.000992063492063492d0) &
+                 + in(i-3,j+4) * (-0.001851851851851852d0) &
+                 + in(i-2,j+4) * (-0.004629629629629629d0) &
+                 + in(i-1,j+4) * (-0.027777777777777776d0) &
+                 + in(i+1,j+4) * (0.000992063492063492d0) &
+                 + in(i+2,j+4) * (0.000992063492063492d0) &
+                 + in(i+3,j+4) * (0.000992063492063492d0) &
+                 + in(i+4,j+4) * (0.006944444444444444d0) &
+                 + in(i+5,j+4) * (0.0006172839506172839d0) &
+                 + in(i+6,j+4) * (0.00042087542087542086d0) &
+                 + in(i+7,j+4) * (0.00030525030525030525d0) &
+                 + in(i+8,j+4) * (0.0002314814814814815d0) &
+                 + in(i+9,j+4) * (0.00018155410312273057d0) &
+                 + in(i-9,j+5) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+5) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+5) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+5) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+5) * (-0.000992063492063492d0) &
+                 + in(i-3,j+5) * (-0.001851851851851852d0) &
+                 + in(i-2,j+5) * (-0.004629629629629629d0) &
+                 + in(i-1,j+5) * (-0.027777777777777776d0) &
+                 + in(i+1,j+5) * (0.0006172839506172839d0) &
+                 + in(i+2,j+5) * (0.0006172839506172839d0) &
+                 + in(i+3,j+5) * (0.0006172839506172839d0) &
+                 + in(i+4,j+5) * (0.0006172839506172839d0) &
+                 + in(i+5,j+5) * (0.005555555555555556d0) &
+                 + in(i+6,j+5) * (0.00042087542087542086d0) &
+                 + in(i+7,j+5) * (0.00030525030525030525d0) &
+                 + in(i+8,j+5) * (0.0002314814814814815d0) &
+                 + in(i+9,j+5) * (0.00018155410312273057d0) &
+                 + in(i-9,j+6) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+6) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+6) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+6) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+6) * (-0.000992063492063492d0) &
+                 + in(i-3,j+6) * (-0.001851851851851852d0) &
+                 + in(i-2,j+6) * (-0.004629629629629629d0) &
+                 + in(i-1,j+6) * (-0.027777777777777776d0) &
+                 + in(i+1,j+6) * (0.00042087542087542086d0) &
+                 + in(i+2,j+6) * (0.00042087542087542086d0) &
+                 + in(i+3,j+6) * (0.00042087542087542086d0) &
+                 + in(i+4,j+6) * (0.00042087542087542086d0) &
+                 + in(i+5,j+6) * (0.00042087542087542086d0) &
+                 + in(i+6,j+6) * (0.004629629629629629d0) &
+                 + in(i+7,j+6) * (0.00030525030525030525d0) &
+                 + in(i+8,j+6) * (0.0002314814814814815d0) &
+                 + in(i+9,j+6) * (0.00018155410312273057d0) &
+                 + in(i-9,j+7) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+7) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+7) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+7) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+7) * (-0.000992063492063492d0) &
+                 + in(i-3,j+7) * (-0.001851851851851852d0) &
+                 + in(i-2,j+7) * (-0.004629629629629629d0) &
+                 + in(i-1,j+7) * (-0.027777777777777776d0) &
+                 + in(i+1,j+7) * (0.00030525030525030525d0) &
+                 + in(i+2,j+7) * (0.00030525030525030525d0) &
+                 + in(i+3,j+7) * (0.00030525030525030525d0) &
+                 + in(i+4,j+7) * (0.00030525030525030525d0) &
+                 + in(i+5,j+7) * (0.00030525030525030525d0) &
+                 + in(i+6,j+7) * (0.00030525030525030525d0) &
+                 + in(i+7,j+7) * (0.003968253968253968d0) &
+                 + in(i+8,j+7) * (0.0002314814814814815d0) &
+                 + in(i+9,j+7) * (0.00018155410312273057d0) &
+                 + in(i-9,j+8) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+8) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+8) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+8) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+8) * (-0.000992063492063492d0) &
+                 + in(i-3,j+8) * (-0.001851851851851852d0) &
+                 + in(i-2,j+8) * (-0.004629629629629629d0) &
+                 + in(i-1,j+8) * (-0.027777777777777776d0) &
+                 + in(i+1,j+8) * (0.0002314814814814815d0) &
+                 + in(i+2,j+8) * (0.0002314814814814815d0) &
+                 + in(i+3,j+8) * (0.0002314814814814815d0) &
+                 + in(i+4,j+8) * (0.0002314814814814815d0) &
+                 + in(i+5,j+8) * (0.0002314814814814815d0) &
+                 + in(i+6,j+8) * (0.0002314814814814815d0) &
+                 + in(i+7,j+8) * (0.0002314814814814815d0) &
+                 + in(i+8,j+8) * (0.003472222222222222d0) &
+                 + in(i+9,j+8) * (0.00018155410312273057d0) &
+                 + in(i-9,j+9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+9) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+9) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+9) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+9) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+9) * (-0.000992063492063492d0) &
+                 + in(i-3,j+9) * (-0.001851851851851852d0) &
+                 + in(i-2,j+9) * (-0.004629629629629629d0) &
+                 + in(i-1,j+9) * (-0.027777777777777776d0) &
+                 + in(i+1,j+9) * (0.00018155410312273057d0) &
+                 + in(i+2,j+9) * (0.00018155410312273057d0) &
+                 + in(i+3,j+9) * (0.00018155410312273057d0) &
+                 + in(i+4,j+9) * (0.00018155410312273057d0) &
+                 + in(i+5,j+9) * (0.00018155410312273057d0) &
+                 + in(i+6,j+9) * (0.00018155410312273057d0) &
+                 + in(i+7,j+9) * (0.00018155410312273057d0) &
+                 + in(i+8,j+9) * (0.00018155410312273057d0) &
+                 + in(i+9,j+9) * (0.0030864197530864196d0) &
 +0.0
       end do
       !$omp end simd
diff --git a/FORTRAN/stencil_taskloop.f90 b/FORTRAN/stencil_taskloop.f90
index 5111b5ec4..77735c322 100644
--- a/FORTRAN/stencil_taskloop.f90
+++ b/FORTRAN/stencil_taskloop.f90
@@ -10,10 +10,10 @@ subroutine star1(n, in, out)
       !$omp simd
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-1) * (-0.5) &
-                 + in(i-1,j+0) * (-0.5) &
-                 + in(i+1,j+0) * (0.5) &
-                 + in(i+0,j+1) * (0.5) &
+                 + in(i+0,j-1) * (-0.5d0) &
+                 + in(i-1,j+0) * (-0.5d0) &
+                 + in(i+1,j+0) * (0.5d0) &
+                 + in(i+0,j+1) * (0.5d0) &
 +0.0
       end do
       !$omp end simd
@@ -33,14 +33,14 @@ subroutine star2(n, in, out)
       !$omp simd
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-2) * (-0.125) &
-                 + in(i+0,j-1) * (-0.25) &
-                 + in(i-2,j+0) * (-0.125) &
-                 + in(i-1,j+0) * (-0.25) &
-                 + in(i+1,j+0) * (0.25) &
-                 + in(i+2,j+0) * (0.125) &
-                 + in(i+0,j+1) * (0.25) &
-                 + in(i+0,j+2) * (0.125) &
+                 + in(i+0,j-2) * (-0.125d0) &
+                 + in(i+0,j-1) * (-0.25d0) &
+                 + in(i-2,j+0) * (-0.125d0) &
+                 + in(i-1,j+0) * (-0.25d0) &
+                 + in(i+1,j+0) * (0.25d0) &
+                 + in(i+2,j+0) * (0.125d0) &
+                 + in(i+0,j+1) * (0.25d0) &
+                 + in(i+0,j+2) * (0.125d0) &
 +0.0
       end do
       !$omp end simd
@@ -60,18 +60,18 @@ subroutine star3(n, in, out)
       !$omp simd
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-3) * (-0.05555555555555555) &
-                 + in(i+0,j-2) * (-0.08333333333333333) &
-                 + in(i+0,j-1) * (-0.16666666666666666) &
-                 + in(i-3,j+0) * (-0.05555555555555555) &
-                 + in(i-2,j+0) * (-0.08333333333333333) &
-                 + in(i-1,j+0) * (-0.16666666666666666) &
-                 + in(i+1,j+0) * (0.16666666666666666) &
-                 + in(i+2,j+0) * (0.08333333333333333) &
-                 + in(i+3,j+0) * (0.05555555555555555) &
-                 + in(i+0,j+1) * (0.16666666666666666) &
-                 + in(i+0,j+2) * (0.08333333333333333) &
-                 + in(i+0,j+3) * (0.05555555555555555) &
+                 + in(i+0,j-3) * (-0.05555555555555555d0) &
+                 + in(i+0,j-2) * (-0.08333333333333333d0) &
+                 + in(i+0,j-1) * (-0.16666666666666666d0) &
+                 + in(i-3,j+0) * (-0.05555555555555555d0) &
+                 + in(i-2,j+0) * (-0.08333333333333333d0) &
+                 + in(i-1,j+0) * (-0.16666666666666666d0) &
+                 + in(i+1,j+0) * (0.16666666666666666d0) &
+                 + in(i+2,j+0) * (0.08333333333333333d0) &
+                 + in(i+3,j+0) * (0.05555555555555555d0) &
+                 + in(i+0,j+1) * (0.16666666666666666d0) &
+                 + in(i+0,j+2) * (0.08333333333333333d0) &
+                 + in(i+0,j+3) * (0.05555555555555555d0) &
 +0.0
       end do
       !$omp end simd
@@ -91,22 +91,22 @@ subroutine star4(n, in, out)
       !$omp simd
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-4) * (-0.03125) &
-                 + in(i+0,j-3) * (-0.041666666666666664) &
-                 + in(i+0,j-2) * (-0.0625) &
-                 + in(i+0,j-1) * (-0.125) &
-                 + in(i-4,j+0) * (-0.03125) &
-                 + in(i-3,j+0) * (-0.041666666666666664) &
-                 + in(i-2,j+0) * (-0.0625) &
-                 + in(i-1,j+0) * (-0.125) &
-                 + in(i+1,j+0) * (0.125) &
-                 + in(i+2,j+0) * (0.0625) &
-                 + in(i+3,j+0) * (0.041666666666666664) &
-                 + in(i+4,j+0) * (0.03125) &
-                 + in(i+0,j+1) * (0.125) &
-                 + in(i+0,j+2) * (0.0625) &
-                 + in(i+0,j+3) * (0.041666666666666664) &
-                 + in(i+0,j+4) * (0.03125) &
+                 + in(i+0,j-4) * (-0.03125d0) &
+                 + in(i+0,j-3) * (-0.041666666666666664d0) &
+                 + in(i+0,j-2) * (-0.0625d0) &
+                 + in(i+0,j-1) * (-0.125d0) &
+                 + in(i-4,j+0) * (-0.03125d0) &
+                 + in(i-3,j+0) * (-0.041666666666666664d0) &
+                 + in(i-2,j+0) * (-0.0625d0) &
+                 + in(i-1,j+0) * (-0.125d0) &
+                 + in(i+1,j+0) * (0.125d0) &
+                 + in(i+2,j+0) * (0.0625d0) &
+                 + in(i+3,j+0) * (0.041666666666666664d0) &
+                 + in(i+4,j+0) * (0.03125d0) &
+                 + in(i+0,j+1) * (0.125d0) &
+                 + in(i+0,j+2) * (0.0625d0) &
+                 + in(i+0,j+3) * (0.041666666666666664d0) &
+                 + in(i+0,j+4) * (0.03125d0) &
 +0.0
       end do
       !$omp end simd
@@ -126,26 +126,26 @@ subroutine star5(n, in, out)
       !$omp simd
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-5) * (-0.02) &
-                 + in(i+0,j-4) * (-0.025) &
-                 + in(i+0,j-3) * (-0.03333333333333333) &
-                 + in(i+0,j-2) * (-0.05) &
-                 + in(i+0,j-1) * (-0.1) &
-                 + in(i-5,j+0) * (-0.02) &
-                 + in(i-4,j+0) * (-0.025) &
-                 + in(i-3,j+0) * (-0.03333333333333333) &
-                 + in(i-2,j+0) * (-0.05) &
-                 + in(i-1,j+0) * (-0.1) &
-                 + in(i+1,j+0) * (0.1) &
-                 + in(i+2,j+0) * (0.05) &
-                 + in(i+3,j+0) * (0.03333333333333333) &
-                 + in(i+4,j+0) * (0.025) &
-                 + in(i+5,j+0) * (0.02) &
-                 + in(i+0,j+1) * (0.1) &
-                 + in(i+0,j+2) * (0.05) &
-                 + in(i+0,j+3) * (0.03333333333333333) &
-                 + in(i+0,j+4) * (0.025) &
-                 + in(i+0,j+5) * (0.02) &
+                 + in(i+0,j-5) * (-0.02d0) &
+                 + in(i+0,j-4) * (-0.025d0) &
+                 + in(i+0,j-3) * (-0.03333333333333333d0) &
+                 + in(i+0,j-2) * (-0.05d0) &
+                 + in(i+0,j-1) * (-0.1d0) &
+                 + in(i-5,j+0) * (-0.02d0) &
+                 + in(i-4,j+0) * (-0.025d0) &
+                 + in(i-3,j+0) * (-0.03333333333333333d0) &
+                 + in(i-2,j+0) * (-0.05d0) &
+                 + in(i-1,j+0) * (-0.1d0) &
+                 + in(i+1,j+0) * (0.1d0) &
+                 + in(i+2,j+0) * (0.05d0) &
+                 + in(i+3,j+0) * (0.03333333333333333d0) &
+                 + in(i+4,j+0) * (0.025d0) &
+                 + in(i+5,j+0) * (0.02d0) &
+                 + in(i+0,j+1) * (0.1d0) &
+                 + in(i+0,j+2) * (0.05d0) &
+                 + in(i+0,j+3) * (0.03333333333333333d0) &
+                 + in(i+0,j+4) * (0.025d0) &
+                 + in(i+0,j+5) * (0.02d0) &
 +0.0
       end do
       !$omp end simd
@@ -165,30 +165,30 @@ subroutine star6(n, in, out)
       !$omp simd
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-6) * (-0.013888888888888888) &
-                 + in(i+0,j-5) * (-0.016666666666666666) &
-                 + in(i+0,j-4) * (-0.020833333333333332) &
-                 + in(i+0,j-3) * (-0.027777777777777776) &
-                 + in(i+0,j-2) * (-0.041666666666666664) &
-                 + in(i+0,j-1) * (-0.08333333333333333) &
-                 + in(i-6,j+0) * (-0.013888888888888888) &
-                 + in(i-5,j+0) * (-0.016666666666666666) &
-                 + in(i-4,j+0) * (-0.020833333333333332) &
-                 + in(i-3,j+0) * (-0.027777777777777776) &
-                 + in(i-2,j+0) * (-0.041666666666666664) &
-                 + in(i-1,j+0) * (-0.08333333333333333) &
-                 + in(i+1,j+0) * (0.08333333333333333) &
-                 + in(i+2,j+0) * (0.041666666666666664) &
-                 + in(i+3,j+0) * (0.027777777777777776) &
-                 + in(i+4,j+0) * (0.020833333333333332) &
-                 + in(i+5,j+0) * (0.016666666666666666) &
-                 + in(i+6,j+0) * (0.013888888888888888) &
-                 + in(i+0,j+1) * (0.08333333333333333) &
-                 + in(i+0,j+2) * (0.041666666666666664) &
-                 + in(i+0,j+3) * (0.027777777777777776) &
-                 + in(i+0,j+4) * (0.020833333333333332) &
-                 + in(i+0,j+5) * (0.016666666666666666) &
-                 + in(i+0,j+6) * (0.013888888888888888) &
+                 + in(i+0,j-6) * (-0.013888888888888888d0) &
+                 + in(i+0,j-5) * (-0.016666666666666666d0) &
+                 + in(i+0,j-4) * (-0.020833333333333332d0) &
+                 + in(i+0,j-3) * (-0.027777777777777776d0) &
+                 + in(i+0,j-2) * (-0.041666666666666664d0) &
+                 + in(i+0,j-1) * (-0.08333333333333333d0) &
+                 + in(i-6,j+0) * (-0.013888888888888888d0) &
+                 + in(i-5,j+0) * (-0.016666666666666666d0) &
+                 + in(i-4,j+0) * (-0.020833333333333332d0) &
+                 + in(i-3,j+0) * (-0.027777777777777776d0) &
+                 + in(i-2,j+0) * (-0.041666666666666664d0) &
+                 + in(i-1,j+0) * (-0.08333333333333333d0) &
+                 + in(i+1,j+0) * (0.08333333333333333d0) &
+                 + in(i+2,j+0) * (0.041666666666666664d0) &
+                 + in(i+3,j+0) * (0.027777777777777776d0) &
+                 + in(i+4,j+0) * (0.020833333333333332d0) &
+                 + in(i+5,j+0) * (0.016666666666666666d0) &
+                 + in(i+6,j+0) * (0.013888888888888888d0) &
+                 + in(i+0,j+1) * (0.08333333333333333d0) &
+                 + in(i+0,j+2) * (0.041666666666666664d0) &
+                 + in(i+0,j+3) * (0.027777777777777776d0) &
+                 + in(i+0,j+4) * (0.020833333333333332d0) &
+                 + in(i+0,j+5) * (0.016666666666666666d0) &
+                 + in(i+0,j+6) * (0.013888888888888888d0) &
 +0.0
       end do
       !$omp end simd
@@ -208,34 +208,34 @@ subroutine star7(n, in, out)
       !$omp simd
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-7) * (-0.01020408163265306) &
-                 + in(i+0,j-6) * (-0.011904761904761904) &
-                 + in(i+0,j-5) * (-0.014285714285714285) &
-                 + in(i+0,j-4) * (-0.017857142857142856) &
-                 + in(i+0,j-3) * (-0.023809523809523808) &
-                 + in(i+0,j-2) * (-0.03571428571428571) &
-                 + in(i+0,j-1) * (-0.07142857142857142) &
-                 + in(i-7,j+0) * (-0.01020408163265306) &
-                 + in(i-6,j+0) * (-0.011904761904761904) &
-                 + in(i-5,j+0) * (-0.014285714285714285) &
-                 + in(i-4,j+0) * (-0.017857142857142856) &
-                 + in(i-3,j+0) * (-0.023809523809523808) &
-                 + in(i-2,j+0) * (-0.03571428571428571) &
-                 + in(i-1,j+0) * (-0.07142857142857142) &
-                 + in(i+1,j+0) * (0.07142857142857142) &
-                 + in(i+2,j+0) * (0.03571428571428571) &
-                 + in(i+3,j+0) * (0.023809523809523808) &
-                 + in(i+4,j+0) * (0.017857142857142856) &
-                 + in(i+5,j+0) * (0.014285714285714285) &
-                 + in(i+6,j+0) * (0.011904761904761904) &
-                 + in(i+7,j+0) * (0.01020408163265306) &
-                 + in(i+0,j+1) * (0.07142857142857142) &
-                 + in(i+0,j+2) * (0.03571428571428571) &
-                 + in(i+0,j+3) * (0.023809523809523808) &
-                 + in(i+0,j+4) * (0.017857142857142856) &
-                 + in(i+0,j+5) * (0.014285714285714285) &
-                 + in(i+0,j+6) * (0.011904761904761904) &
-                 + in(i+0,j+7) * (0.01020408163265306) &
+                 + in(i+0,j-7) * (-0.01020408163265306d0) &
+                 + in(i+0,j-6) * (-0.011904761904761904d0) &
+                 + in(i+0,j-5) * (-0.014285714285714285d0) &
+                 + in(i+0,j-4) * (-0.017857142857142856d0) &
+                 + in(i+0,j-3) * (-0.023809523809523808d0) &
+                 + in(i+0,j-2) * (-0.03571428571428571d0) &
+                 + in(i+0,j-1) * (-0.07142857142857142d0) &
+                 + in(i-7,j+0) * (-0.01020408163265306d0) &
+                 + in(i-6,j+0) * (-0.011904761904761904d0) &
+                 + in(i-5,j+0) * (-0.014285714285714285d0) &
+                 + in(i-4,j+0) * (-0.017857142857142856d0) &
+                 + in(i-3,j+0) * (-0.023809523809523808d0) &
+                 + in(i-2,j+0) * (-0.03571428571428571d0) &
+                 + in(i-1,j+0) * (-0.07142857142857142d0) &
+                 + in(i+1,j+0) * (0.07142857142857142d0) &
+                 + in(i+2,j+0) * (0.03571428571428571d0) &
+                 + in(i+3,j+0) * (0.023809523809523808d0) &
+                 + in(i+4,j+0) * (0.017857142857142856d0) &
+                 + in(i+5,j+0) * (0.014285714285714285d0) &
+                 + in(i+6,j+0) * (0.011904761904761904d0) &
+                 + in(i+7,j+0) * (0.01020408163265306d0) &
+                 + in(i+0,j+1) * (0.07142857142857142d0) &
+                 + in(i+0,j+2) * (0.03571428571428571d0) &
+                 + in(i+0,j+3) * (0.023809523809523808d0) &
+                 + in(i+0,j+4) * (0.017857142857142856d0) &
+                 + in(i+0,j+5) * (0.014285714285714285d0) &
+                 + in(i+0,j+6) * (0.011904761904761904d0) &
+                 + in(i+0,j+7) * (0.01020408163265306d0) &
 +0.0
       end do
       !$omp end simd
@@ -255,38 +255,38 @@ subroutine star8(n, in, out)
       !$omp simd
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-8) * (-0.0078125) &
-                 + in(i+0,j-7) * (-0.008928571428571428) &
-                 + in(i+0,j-6) * (-0.010416666666666666) &
-                 + in(i+0,j-5) * (-0.0125) &
-                 + in(i+0,j-4) * (-0.015625) &
-                 + in(i+0,j-3) * (-0.020833333333333332) &
-                 + in(i+0,j-2) * (-0.03125) &
-                 + in(i+0,j-1) * (-0.0625) &
-                 + in(i-8,j+0) * (-0.0078125) &
-                 + in(i-7,j+0) * (-0.008928571428571428) &
-                 + in(i-6,j+0) * (-0.010416666666666666) &
-                 + in(i-5,j+0) * (-0.0125) &
-                 + in(i-4,j+0) * (-0.015625) &
-                 + in(i-3,j+0) * (-0.020833333333333332) &
-                 + in(i-2,j+0) * (-0.03125) &
-                 + in(i-1,j+0) * (-0.0625) &
-                 + in(i+1,j+0) * (0.0625) &
-                 + in(i+2,j+0) * (0.03125) &
-                 + in(i+3,j+0) * (0.020833333333333332) &
-                 + in(i+4,j+0) * (0.015625) &
-                 + in(i+5,j+0) * (0.0125) &
-                 + in(i+6,j+0) * (0.010416666666666666) &
-                 + in(i+7,j+0) * (0.008928571428571428) &
-                 + in(i+8,j+0) * (0.0078125) &
-                 + in(i+0,j+1) * (0.0625) &
-                 + in(i+0,j+2) * (0.03125) &
-                 + in(i+0,j+3) * (0.020833333333333332) &
-                 + in(i+0,j+4) * (0.015625) &
-                 + in(i+0,j+5) * (0.0125) &
-                 + in(i+0,j+6) * (0.010416666666666666) &
-                 + in(i+0,j+7) * (0.008928571428571428) &
-                 + in(i+0,j+8) * (0.0078125) &
+                 + in(i+0,j-8) * (-0.0078125d0) &
+                 + in(i+0,j-7) * (-0.008928571428571428d0) &
+                 + in(i+0,j-6) * (-0.010416666666666666d0) &
+                 + in(i+0,j-5) * (-0.0125d0) &
+                 + in(i+0,j-4) * (-0.015625d0) &
+                 + in(i+0,j-3) * (-0.020833333333333332d0) &
+                 + in(i+0,j-2) * (-0.03125d0) &
+                 + in(i+0,j-1) * (-0.0625d0) &
+                 + in(i-8,j+0) * (-0.0078125d0) &
+                 + in(i-7,j+0) * (-0.008928571428571428d0) &
+                 + in(i-6,j+0) * (-0.010416666666666666d0) &
+                 + in(i-5,j+0) * (-0.0125d0) &
+                 + in(i-4,j+0) * (-0.015625d0) &
+                 + in(i-3,j+0) * (-0.020833333333333332d0) &
+                 + in(i-2,j+0) * (-0.03125d0) &
+                 + in(i-1,j+0) * (-0.0625d0) &
+                 + in(i+1,j+0) * (0.0625d0) &
+                 + in(i+2,j+0) * (0.03125d0) &
+                 + in(i+3,j+0) * (0.020833333333333332d0) &
+                 + in(i+4,j+0) * (0.015625d0) &
+                 + in(i+5,j+0) * (0.0125d0) &
+                 + in(i+6,j+0) * (0.010416666666666666d0) &
+                 + in(i+7,j+0) * (0.008928571428571428d0) &
+                 + in(i+8,j+0) * (0.0078125d0) &
+                 + in(i+0,j+1) * (0.0625d0) &
+                 + in(i+0,j+2) * (0.03125d0) &
+                 + in(i+0,j+3) * (0.020833333333333332d0) &
+                 + in(i+0,j+4) * (0.015625d0) &
+                 + in(i+0,j+5) * (0.0125d0) &
+                 + in(i+0,j+6) * (0.010416666666666666d0) &
+                 + in(i+0,j+7) * (0.008928571428571428d0) &
+                 + in(i+0,j+8) * (0.0078125d0) &
 +0.0
       end do
       !$omp end simd
@@ -306,42 +306,42 @@ subroutine star9(n, in, out)
       !$omp simd
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i+0,j-9) * (-0.006172839506172839) &
-                 + in(i+0,j-8) * (-0.006944444444444444) &
-                 + in(i+0,j-7) * (-0.007936507936507936) &
-                 + in(i+0,j-6) * (-0.009259259259259259) &
-                 + in(i+0,j-5) * (-0.011111111111111112) &
-                 + in(i+0,j-4) * (-0.013888888888888888) &
-                 + in(i+0,j-3) * (-0.018518518518518517) &
-                 + in(i+0,j-2) * (-0.027777777777777776) &
-                 + in(i+0,j-1) * (-0.05555555555555555) &
-                 + in(i-9,j+0) * (-0.006172839506172839) &
-                 + in(i-8,j+0) * (-0.006944444444444444) &
-                 + in(i-7,j+0) * (-0.007936507936507936) &
-                 + in(i-6,j+0) * (-0.009259259259259259) &
-                 + in(i-5,j+0) * (-0.011111111111111112) &
-                 + in(i-4,j+0) * (-0.013888888888888888) &
-                 + in(i-3,j+0) * (-0.018518518518518517) &
-                 + in(i-2,j+0) * (-0.027777777777777776) &
-                 + in(i-1,j+0) * (-0.05555555555555555) &
-                 + in(i+1,j+0) * (0.05555555555555555) &
-                 + in(i+2,j+0) * (0.027777777777777776) &
-                 + in(i+3,j+0) * (0.018518518518518517) &
-                 + in(i+4,j+0) * (0.013888888888888888) &
-                 + in(i+5,j+0) * (0.011111111111111112) &
-                 + in(i+6,j+0) * (0.009259259259259259) &
-                 + in(i+7,j+0) * (0.007936507936507936) &
-                 + in(i+8,j+0) * (0.006944444444444444) &
-                 + in(i+9,j+0) * (0.006172839506172839) &
-                 + in(i+0,j+1) * (0.05555555555555555) &
-                 + in(i+0,j+2) * (0.027777777777777776) &
-                 + in(i+0,j+3) * (0.018518518518518517) &
-                 + in(i+0,j+4) * (0.013888888888888888) &
-                 + in(i+0,j+5) * (0.011111111111111112) &
-                 + in(i+0,j+6) * (0.009259259259259259) &
-                 + in(i+0,j+7) * (0.007936507936507936) &
-                 + in(i+0,j+8) * (0.006944444444444444) &
-                 + in(i+0,j+9) * (0.006172839506172839) &
+                 + in(i+0,j-9) * (-0.006172839506172839d0) &
+                 + in(i+0,j-8) * (-0.006944444444444444d0) &
+                 + in(i+0,j-7) * (-0.007936507936507936d0) &
+                 + in(i+0,j-6) * (-0.009259259259259259d0) &
+                 + in(i+0,j-5) * (-0.011111111111111112d0) &
+                 + in(i+0,j-4) * (-0.013888888888888888d0) &
+                 + in(i+0,j-3) * (-0.018518518518518517d0) &
+                 + in(i+0,j-2) * (-0.027777777777777776d0) &
+                 + in(i+0,j-1) * (-0.05555555555555555d0) &
+                 + in(i-9,j+0) * (-0.006172839506172839d0) &
+                 + in(i-8,j+0) * (-0.006944444444444444d0) &
+                 + in(i-7,j+0) * (-0.007936507936507936d0) &
+                 + in(i-6,j+0) * (-0.009259259259259259d0) &
+                 + in(i-5,j+0) * (-0.011111111111111112d0) &
+                 + in(i-4,j+0) * (-0.013888888888888888d0) &
+                 + in(i-3,j+0) * (-0.018518518518518517d0) &
+                 + in(i-2,j+0) * (-0.027777777777777776d0) &
+                 + in(i-1,j+0) * (-0.05555555555555555d0) &
+                 + in(i+1,j+0) * (0.05555555555555555d0) &
+                 + in(i+2,j+0) * (0.027777777777777776d0) &
+                 + in(i+3,j+0) * (0.018518518518518517d0) &
+                 + in(i+4,j+0) * (0.013888888888888888d0) &
+                 + in(i+5,j+0) * (0.011111111111111112d0) &
+                 + in(i+6,j+0) * (0.009259259259259259d0) &
+                 + in(i+7,j+0) * (0.007936507936507936d0) &
+                 + in(i+8,j+0) * (0.006944444444444444d0) &
+                 + in(i+9,j+0) * (0.006172839506172839d0) &
+                 + in(i+0,j+1) * (0.05555555555555555d0) &
+                 + in(i+0,j+2) * (0.027777777777777776d0) &
+                 + in(i+0,j+3) * (0.018518518518518517d0) &
+                 + in(i+0,j+4) * (0.013888888888888888d0) &
+                 + in(i+0,j+5) * (0.011111111111111112d0) &
+                 + in(i+0,j+6) * (0.009259259259259259d0) &
+                 + in(i+0,j+7) * (0.007936507936507936d0) &
+                 + in(i+0,j+8) * (0.006944444444444444d0) &
+                 + in(i+0,j+9) * (0.006172839506172839d0) &
 +0.0
       end do
       !$omp end simd
@@ -361,10 +361,10 @@ subroutine grid1(n, in, out)
       !$omp simd
       do j=1,n-1-1
         out(i,j) = out(i,j) &
-                 + in(i-1,j-1) * (-0.25) &
-                 + in(i+1,j-1) * (-0.25) &
-                 + in(i-1,j+1) * (-0.25) &
-                 + in(i+1,j+1) * (0.25) &
+                 + in(i-1,j-1) * (-0.25d0) &
+                 + in(i+1,j-1) * (-0.25d0) &
+                 + in(i-1,j+1) * (-0.25d0) &
+                 + in(i+1,j+1) * (0.25d0) &
 +0.0
       end do
       !$omp end simd
@@ -384,20 +384,20 @@ subroutine grid2(n, in, out)
       !$omp simd
       do j=2,n-2-1
         out(i,j) = out(i,j) &
-                 + in(i-2,j-2) * (-0.0625) &
-                 + in(i+1,j-2) * (-0.020833333333333332) &
-                 + in(i+2,j-2) * (-0.020833333333333332) &
-                 + in(i-1,j-1) * (-0.125) &
-                 + in(i+1,j-1) * (-0.125) &
-                 + in(i+2,j-1) * (-0.125) &
-                 + in(i-2,j+1) * (-0.020833333333333332) &
-                 + in(i-1,j+1) * (-0.125) &
-                 + in(i+1,j+1) * (0.125) &
-                 + in(i+2,j+1) * (0.020833333333333332) &
-                 + in(i-2,j+2) * (-0.020833333333333332) &
-                 + in(i-1,j+2) * (-0.125) &
-                 + in(i+1,j+2) * (0.020833333333333332) &
-                 + in(i+2,j+2) * (0.0625) &
+                 + in(i-2,j-2) * (-0.0625d0) &
+                 + in(i+1,j-2) * (-0.020833333333333332d0) &
+                 + in(i+2,j-2) * (-0.020833333333333332d0) &
+                 + in(i-1,j-1) * (-0.125d0) &
+                 + in(i+1,j-1) * (-0.125d0) &
+                 + in(i+2,j-1) * (-0.125d0) &
+                 + in(i-2,j+1) * (-0.020833333333333332d0) &
+                 + in(i-1,j+1) * (-0.125d0) &
+                 + in(i+1,j+1) * (0.125d0) &
+                 + in(i+2,j+1) * (0.020833333333333332d0) &
+                 + in(i-2,j+2) * (-0.020833333333333332d0) &
+                 + in(i-1,j+2) * (-0.125d0) &
+                 + in(i+1,j+2) * (0.020833333333333332d0) &
+                 + in(i+2,j+2) * (0.0625d0) &
 +0.0
       end do
       !$omp end simd
@@ -417,36 +417,36 @@ subroutine grid3(n, in, out)
       !$omp simd
       do j=3,n-3-1
         out(i,j) = out(i,j) &
-                 + in(i-3,j-3) * (-0.027777777777777776) &
-                 + in(i+1,j-3) * (-0.005555555555555556) &
-                 + in(i+2,j-3) * (-0.005555555555555556) &
-                 + in(i+3,j-3) * (-0.005555555555555556) &
-                 + in(i-2,j-2) * (-0.041666666666666664) &
-                 + in(i+1,j-2) * (-0.013888888888888888) &
-                 + in(i+2,j-2) * (-0.013888888888888888) &
-                 + in(i+3,j-2) * (-0.013888888888888888) &
-                 + in(i-1,j-1) * (-0.08333333333333333) &
-                 + in(i+1,j-1) * (-0.08333333333333333) &
-                 + in(i+2,j-1) * (-0.08333333333333333) &
-                 + in(i+3,j-1) * (-0.08333333333333333) &
-                 + in(i-3,j+1) * (-0.005555555555555556) &
-                 + in(i-2,j+1) * (-0.013888888888888888) &
-                 + in(i-1,j+1) * (-0.08333333333333333) &
-                 + in(i+1,j+1) * (0.08333333333333333) &
-                 + in(i+2,j+1) * (0.013888888888888888) &
-                 + in(i+3,j+1) * (0.005555555555555556) &
-                 + in(i-3,j+2) * (-0.005555555555555556) &
-                 + in(i-2,j+2) * (-0.013888888888888888) &
-                 + in(i-1,j+2) * (-0.08333333333333333) &
-                 + in(i+1,j+2) * (0.013888888888888888) &
-                 + in(i+2,j+2) * (0.041666666666666664) &
-                 + in(i+3,j+2) * (0.005555555555555556) &
-                 + in(i-3,j+3) * (-0.005555555555555556) &
-                 + in(i-2,j+3) * (-0.013888888888888888) &
-                 + in(i-1,j+3) * (-0.08333333333333333) &
-                 + in(i+1,j+3) * (0.005555555555555556) &
-                 + in(i+2,j+3) * (0.005555555555555556) &
-                 + in(i+3,j+3) * (0.027777777777777776) &
+                 + in(i-3,j-3) * (-0.027777777777777776d0) &
+                 + in(i+1,j-3) * (-0.005555555555555556d0) &
+                 + in(i+2,j-3) * (-0.005555555555555556d0) &
+                 + in(i+3,j-3) * (-0.005555555555555556d0) &
+                 + in(i-2,j-2) * (-0.041666666666666664d0) &
+                 + in(i+1,j-2) * (-0.013888888888888888d0) &
+                 + in(i+2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+3,j-2) * (-0.013888888888888888d0) &
+                 + in(i-1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+1,j-1) * (-0.08333333333333333d0) &
+                 + in(i+2,j-1) * (-0.08333333333333333d0) &
+                 + in(i+3,j-1) * (-0.08333333333333333d0) &
+                 + in(i-3,j+1) * (-0.005555555555555556d0) &
+                 + in(i-2,j+1) * (-0.013888888888888888d0) &
+                 + in(i-1,j+1) * (-0.08333333333333333d0) &
+                 + in(i+1,j+1) * (0.08333333333333333d0) &
+                 + in(i+2,j+1) * (0.013888888888888888d0) &
+                 + in(i+3,j+1) * (0.005555555555555556d0) &
+                 + in(i-3,j+2) * (-0.005555555555555556d0) &
+                 + in(i-2,j+2) * (-0.013888888888888888d0) &
+                 + in(i-1,j+2) * (-0.08333333333333333d0) &
+                 + in(i+1,j+2) * (0.013888888888888888d0) &
+                 + in(i+2,j+2) * (0.041666666666666664d0) &
+                 + in(i+3,j+2) * (0.005555555555555556d0) &
+                 + in(i-3,j+3) * (-0.005555555555555556d0) &
+                 + in(i-2,j+3) * (-0.013888888888888888d0) &
+                 + in(i-1,j+3) * (-0.08333333333333333d0) &
+                 + in(i+1,j+3) * (0.005555555555555556d0) &
+                 + in(i+2,j+3) * (0.005555555555555556d0) &
+                 + in(i+3,j+3) * (0.027777777777777776d0) &
 +0.0
       end do
       !$omp end simd
@@ -466,58 +466,58 @@ subroutine grid4(n, in, out)
       !$omp simd
       do j=4,n-4-1
         out(i,j) = out(i,j) &
-                 + in(i-4,j-4) * (-0.015625) &
-                 + in(i+1,j-4) * (-0.002232142857142857) &
-                 + in(i+2,j-4) * (-0.002232142857142857) &
-                 + in(i+3,j-4) * (-0.002232142857142857) &
-                 + in(i+4,j-4) * (-0.002232142857142857) &
-                 + in(i-3,j-3) * (-0.020833333333333332) &
-                 + in(i+1,j-3) * (-0.004166666666666667) &
-                 + in(i+2,j-3) * (-0.004166666666666667) &
-                 + in(i+3,j-3) * (-0.004166666666666667) &
-                 + in(i+4,j-3) * (-0.004166666666666667) &
-                 + in(i-2,j-2) * (-0.03125) &
-                 + in(i+1,j-2) * (-0.010416666666666666) &
-                 + in(i+2,j-2) * (-0.010416666666666666) &
-                 + in(i+3,j-2) * (-0.010416666666666666) &
-                 + in(i+4,j-2) * (-0.010416666666666666) &
-                 + in(i-1,j-1) * (-0.0625) &
-                 + in(i+1,j-1) * (-0.0625) &
-                 + in(i+2,j-1) * (-0.0625) &
-                 + in(i+3,j-1) * (-0.0625) &
-                 + in(i+4,j-1) * (-0.0625) &
-                 + in(i-4,j+1) * (-0.002232142857142857) &
-                 + in(i-3,j+1) * (-0.004166666666666667) &
-                 + in(i-2,j+1) * (-0.010416666666666666) &
-                 + in(i-1,j+1) * (-0.0625) &
-                 + in(i+1,j+1) * (0.0625) &
-                 + in(i+2,j+1) * (0.010416666666666666) &
-                 + in(i+3,j+1) * (0.004166666666666667) &
-                 + in(i+4,j+1) * (0.002232142857142857) &
-                 + in(i-4,j+2) * (-0.002232142857142857) &
-                 + in(i-3,j+2) * (-0.004166666666666667) &
-                 + in(i-2,j+2) * (-0.010416666666666666) &
-                 + in(i-1,j+2) * (-0.0625) &
-                 + in(i+1,j+2) * (0.010416666666666666) &
-                 + in(i+2,j+2) * (0.03125) &
-                 + in(i+3,j+2) * (0.004166666666666667) &
-                 + in(i+4,j+2) * (0.002232142857142857) &
-                 + in(i-4,j+3) * (-0.002232142857142857) &
-                 + in(i-3,j+3) * (-0.004166666666666667) &
-                 + in(i-2,j+3) * (-0.010416666666666666) &
-                 + in(i-1,j+3) * (-0.0625) &
-                 + in(i+1,j+3) * (0.004166666666666667) &
-                 + in(i+2,j+3) * (0.004166666666666667) &
-                 + in(i+3,j+3) * (0.020833333333333332) &
-                 + in(i+4,j+3) * (0.002232142857142857) &
-                 + in(i-4,j+4) * (-0.002232142857142857) &
-                 + in(i-3,j+4) * (-0.004166666666666667) &
-                 + in(i-2,j+4) * (-0.010416666666666666) &
-                 + in(i-1,j+4) * (-0.0625) &
-                 + in(i+1,j+4) * (0.002232142857142857) &
-                 + in(i+2,j+4) * (0.002232142857142857) &
-                 + in(i+3,j+4) * (0.002232142857142857) &
-                 + in(i+4,j+4) * (0.015625) &
+                 + in(i-4,j-4) * (-0.015625d0) &
+                 + in(i+1,j-4) * (-0.002232142857142857d0) &
+                 + in(i+2,j-4) * (-0.002232142857142857d0) &
+                 + in(i+3,j-4) * (-0.002232142857142857d0) &
+                 + in(i+4,j-4) * (-0.002232142857142857d0) &
+                 + in(i-3,j-3) * (-0.020833333333333332d0) &
+                 + in(i+1,j-3) * (-0.004166666666666667d0) &
+                 + in(i+2,j-3) * (-0.004166666666666667d0) &
+                 + in(i+3,j-3) * (-0.004166666666666667d0) &
+                 + in(i+4,j-3) * (-0.004166666666666667d0) &
+                 + in(i-2,j-2) * (-0.03125d0) &
+                 + in(i+1,j-2) * (-0.010416666666666666d0) &
+                 + in(i+2,j-2) * (-0.010416666666666666d0) &
+                 + in(i+3,j-2) * (-0.010416666666666666d0) &
+                 + in(i+4,j-2) * (-0.010416666666666666d0) &
+                 + in(i-1,j-1) * (-0.0625d0) &
+                 + in(i+1,j-1) * (-0.0625d0) &
+                 + in(i+2,j-1) * (-0.0625d0) &
+                 + in(i+3,j-1) * (-0.0625d0) &
+                 + in(i+4,j-1) * (-0.0625d0) &
+                 + in(i-4,j+1) * (-0.002232142857142857d0) &
+                 + in(i-3,j+1) * (-0.004166666666666667d0) &
+                 + in(i-2,j+1) * (-0.010416666666666666d0) &
+                 + in(i-1,j+1) * (-0.0625d0) &
+                 + in(i+1,j+1) * (0.0625d0) &
+                 + in(i+2,j+1) * (0.010416666666666666d0) &
+                 + in(i+3,j+1) * (0.004166666666666667d0) &
+                 + in(i+4,j+1) * (0.002232142857142857d0) &
+                 + in(i-4,j+2) * (-0.002232142857142857d0) &
+                 + in(i-3,j+2) * (-0.004166666666666667d0) &
+                 + in(i-2,j+2) * (-0.010416666666666666d0) &
+                 + in(i-1,j+2) * (-0.0625d0) &
+                 + in(i+1,j+2) * (0.010416666666666666d0) &
+                 + in(i+2,j+2) * (0.03125d0) &
+                 + in(i+3,j+2) * (0.004166666666666667d0) &
+                 + in(i+4,j+2) * (0.002232142857142857d0) &
+                 + in(i-4,j+3) * (-0.002232142857142857d0) &
+                 + in(i-3,j+3) * (-0.004166666666666667d0) &
+                 + in(i-2,j+3) * (-0.010416666666666666d0) &
+                 + in(i-1,j+3) * (-0.0625d0) &
+                 + in(i+1,j+3) * (0.004166666666666667d0) &
+                 + in(i+2,j+3) * (0.004166666666666667d0) &
+                 + in(i+3,j+3) * (0.020833333333333332d0) &
+                 + in(i+4,j+3) * (0.002232142857142857d0) &
+                 + in(i-4,j+4) * (-0.002232142857142857d0) &
+                 + in(i-3,j+4) * (-0.004166666666666667d0) &
+                 + in(i-2,j+4) * (-0.010416666666666666d0) &
+                 + in(i-1,j+4) * (-0.0625d0) &
+                 + in(i+1,j+4) * (0.002232142857142857d0) &
+                 + in(i+2,j+4) * (0.002232142857142857d0) &
+                 + in(i+3,j+4) * (0.002232142857142857d0) &
+                 + in(i+4,j+4) * (0.015625d0) &
 +0.0
       end do
       !$omp end simd
@@ -537,86 +537,86 @@ subroutine grid5(n, in, out)
       !$omp simd
       do j=5,n-5-1
         out(i,j) = out(i,j) &
-                 + in(i-5,j-5) * (-0.01) &
-                 + in(i+1,j-5) * (-0.0011111111111111111) &
-                 + in(i+2,j-5) * (-0.0011111111111111111) &
-                 + in(i+3,j-5) * (-0.0011111111111111111) &
-                 + in(i+4,j-5) * (-0.0011111111111111111) &
-                 + in(i+5,j-5) * (-0.0011111111111111111) &
-                 + in(i-4,j-4) * (-0.0125) &
-                 + in(i+1,j-4) * (-0.0017857142857142857) &
-                 + in(i+2,j-4) * (-0.0017857142857142857) &
-                 + in(i+3,j-4) * (-0.0017857142857142857) &
-                 + in(i+4,j-4) * (-0.0017857142857142857) &
-                 + in(i+5,j-4) * (-0.0017857142857142857) &
-                 + in(i-3,j-3) * (-0.016666666666666666) &
-                 + in(i+1,j-3) * (-0.0033333333333333335) &
-                 + in(i+2,j-3) * (-0.0033333333333333335) &
-                 + in(i+3,j-3) * (-0.0033333333333333335) &
-                 + in(i+4,j-3) * (-0.0033333333333333335) &
-                 + in(i+5,j-3) * (-0.0033333333333333335) &
-                 + in(i-2,j-2) * (-0.025) &
-                 + in(i+1,j-2) * (-0.008333333333333333) &
-                 + in(i+2,j-2) * (-0.008333333333333333) &
-                 + in(i+3,j-2) * (-0.008333333333333333) &
-                 + in(i+4,j-2) * (-0.008333333333333333) &
-                 + in(i+5,j-2) * (-0.008333333333333333) &
-                 + in(i-1,j-1) * (-0.05) &
-                 + in(i+1,j-1) * (-0.05) &
-                 + in(i+2,j-1) * (-0.05) &
-                 + in(i+3,j-1) * (-0.05) &
-                 + in(i+4,j-1) * (-0.05) &
-                 + in(i+5,j-1) * (-0.05) &
-                 + in(i-5,j+1) * (-0.0011111111111111111) &
-                 + in(i-4,j+1) * (-0.0017857142857142857) &
-                 + in(i-3,j+1) * (-0.0033333333333333335) &
-                 + in(i-2,j+1) * (-0.008333333333333333) &
-                 + in(i-1,j+1) * (-0.05) &
-                 + in(i+1,j+1) * (0.05) &
-                 + in(i+2,j+1) * (0.008333333333333333) &
-                 + in(i+3,j+1) * (0.0033333333333333335) &
-                 + in(i+4,j+1) * (0.0017857142857142857) &
-                 + in(i+5,j+1) * (0.0011111111111111111) &
-                 + in(i-5,j+2) * (-0.0011111111111111111) &
-                 + in(i-4,j+2) * (-0.0017857142857142857) &
-                 + in(i-3,j+2) * (-0.0033333333333333335) &
-                 + in(i-2,j+2) * (-0.008333333333333333) &
-                 + in(i-1,j+2) * (-0.05) &
-                 + in(i+1,j+2) * (0.008333333333333333) &
-                 + in(i+2,j+2) * (0.025) &
-                 + in(i+3,j+2) * (0.0033333333333333335) &
-                 + in(i+4,j+2) * (0.0017857142857142857) &
-                 + in(i+5,j+2) * (0.0011111111111111111) &
-                 + in(i-5,j+3) * (-0.0011111111111111111) &
-                 + in(i-4,j+3) * (-0.0017857142857142857) &
-                 + in(i-3,j+3) * (-0.0033333333333333335) &
-                 + in(i-2,j+3) * (-0.008333333333333333) &
-                 + in(i-1,j+3) * (-0.05) &
-                 + in(i+1,j+3) * (0.0033333333333333335) &
-                 + in(i+2,j+3) * (0.0033333333333333335) &
-                 + in(i+3,j+3) * (0.016666666666666666) &
-                 + in(i+4,j+3) * (0.0017857142857142857) &
-                 + in(i+5,j+3) * (0.0011111111111111111) &
-                 + in(i-5,j+4) * (-0.0011111111111111111) &
-                 + in(i-4,j+4) * (-0.0017857142857142857) &
-                 + in(i-3,j+4) * (-0.0033333333333333335) &
-                 + in(i-2,j+4) * (-0.008333333333333333) &
-                 + in(i-1,j+4) * (-0.05) &
-                 + in(i+1,j+4) * (0.0017857142857142857) &
-                 + in(i+2,j+4) * (0.0017857142857142857) &
-                 + in(i+3,j+4) * (0.0017857142857142857) &
-                 + in(i+4,j+4) * (0.0125) &
-                 + in(i+5,j+4) * (0.0011111111111111111) &
-                 + in(i-5,j+5) * (-0.0011111111111111111) &
-                 + in(i-4,j+5) * (-0.0017857142857142857) &
-                 + in(i-3,j+5) * (-0.0033333333333333335) &
-                 + in(i-2,j+5) * (-0.008333333333333333) &
-                 + in(i-1,j+5) * (-0.05) &
-                 + in(i+1,j+5) * (0.0011111111111111111) &
-                 + in(i+2,j+5) * (0.0011111111111111111) &
-                 + in(i+3,j+5) * (0.0011111111111111111) &
-                 + in(i+4,j+5) * (0.0011111111111111111) &
-                 + in(i+5,j+5) * (0.01) &
+                 + in(i-5,j-5) * (-0.01d0) &
+                 + in(i+1,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+2,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+3,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+4,j-5) * (-0.0011111111111111111d0) &
+                 + in(i+5,j-5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j-4) * (-0.0125d0) &
+                 + in(i+1,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+2,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+3,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+4,j-4) * (-0.0017857142857142857d0) &
+                 + in(i+5,j-4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j-3) * (-0.016666666666666666d0) &
+                 + in(i+1,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+2,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+3,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+4,j-3) * (-0.0033333333333333335d0) &
+                 + in(i+5,j-3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j-2) * (-0.025d0) &
+                 + in(i+1,j-2) * (-0.008333333333333333d0) &
+                 + in(i+2,j-2) * (-0.008333333333333333d0) &
+                 + in(i+3,j-2) * (-0.008333333333333333d0) &
+                 + in(i+4,j-2) * (-0.008333333333333333d0) &
+                 + in(i+5,j-2) * (-0.008333333333333333d0) &
+                 + in(i-1,j-1) * (-0.05d0) &
+                 + in(i+1,j-1) * (-0.05d0) &
+                 + in(i+2,j-1) * (-0.05d0) &
+                 + in(i+3,j-1) * (-0.05d0) &
+                 + in(i+4,j-1) * (-0.05d0) &
+                 + in(i+5,j-1) * (-0.05d0) &
+                 + in(i-5,j+1) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+1) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+1) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+1) * (-0.008333333333333333d0) &
+                 + in(i-1,j+1) * (-0.05d0) &
+                 + in(i+1,j+1) * (0.05d0) &
+                 + in(i+2,j+1) * (0.008333333333333333d0) &
+                 + in(i+3,j+1) * (0.0033333333333333335d0) &
+                 + in(i+4,j+1) * (0.0017857142857142857d0) &
+                 + in(i+5,j+1) * (0.0011111111111111111d0) &
+                 + in(i-5,j+2) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+2) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+2) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+2) * (-0.008333333333333333d0) &
+                 + in(i-1,j+2) * (-0.05d0) &
+                 + in(i+1,j+2) * (0.008333333333333333d0) &
+                 + in(i+2,j+2) * (0.025d0) &
+                 + in(i+3,j+2) * (0.0033333333333333335d0) &
+                 + in(i+4,j+2) * (0.0017857142857142857d0) &
+                 + in(i+5,j+2) * (0.0011111111111111111d0) &
+                 + in(i-5,j+3) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+3) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+3) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+3) * (-0.008333333333333333d0) &
+                 + in(i-1,j+3) * (-0.05d0) &
+                 + in(i+1,j+3) * (0.0033333333333333335d0) &
+                 + in(i+2,j+3) * (0.0033333333333333335d0) &
+                 + in(i+3,j+3) * (0.016666666666666666d0) &
+                 + in(i+4,j+3) * (0.0017857142857142857d0) &
+                 + in(i+5,j+3) * (0.0011111111111111111d0) &
+                 + in(i-5,j+4) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+4) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+4) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+4) * (-0.008333333333333333d0) &
+                 + in(i-1,j+4) * (-0.05d0) &
+                 + in(i+1,j+4) * (0.0017857142857142857d0) &
+                 + in(i+2,j+4) * (0.0017857142857142857d0) &
+                 + in(i+3,j+4) * (0.0017857142857142857d0) &
+                 + in(i+4,j+4) * (0.0125d0) &
+                 + in(i+5,j+4) * (0.0011111111111111111d0) &
+                 + in(i-5,j+5) * (-0.0011111111111111111d0) &
+                 + in(i-4,j+5) * (-0.0017857142857142857d0) &
+                 + in(i-3,j+5) * (-0.0033333333333333335d0) &
+                 + in(i-2,j+5) * (-0.008333333333333333d0) &
+                 + in(i-1,j+5) * (-0.05d0) &
+                 + in(i+1,j+5) * (0.0011111111111111111d0) &
+                 + in(i+2,j+5) * (0.0011111111111111111d0) &
+                 + in(i+3,j+5) * (0.0011111111111111111d0) &
+                 + in(i+4,j+5) * (0.0011111111111111111d0) &
+                 + in(i+5,j+5) * (0.01d0) &
 +0.0
       end do
       !$omp end simd
@@ -636,120 +636,120 @@ subroutine grid6(n, in, out)
       !$omp simd
       do j=6,n-6-1
         out(i,j) = out(i,j) &
-                 + in(i-6,j-6) * (-0.006944444444444444) &
-                 + in(i+1,j-6) * (-0.0006313131313131314) &
-                 + in(i+2,j-6) * (-0.0006313131313131314) &
-                 + in(i+3,j-6) * (-0.0006313131313131314) &
-                 + in(i+4,j-6) * (-0.0006313131313131314) &
-                 + in(i+5,j-6) * (-0.0006313131313131314) &
-                 + in(i+6,j-6) * (-0.0006313131313131314) &
-                 + in(i-5,j-5) * (-0.008333333333333333) &
-                 + in(i+1,j-5) * (-0.000925925925925926) &
-                 + in(i+2,j-5) * (-0.000925925925925926) &
-                 + in(i+3,j-5) * (-0.000925925925925926) &
-                 + in(i+4,j-5) * (-0.000925925925925926) &
-                 + in(i+5,j-5) * (-0.000925925925925926) &
-                 + in(i+6,j-5) * (-0.000925925925925926) &
-                 + in(i-4,j-4) * (-0.010416666666666666) &
-                 + in(i+1,j-4) * (-0.001488095238095238) &
-                 + in(i+2,j-4) * (-0.001488095238095238) &
-                 + in(i+3,j-4) * (-0.001488095238095238) &
-                 + in(i+4,j-4) * (-0.001488095238095238) &
-                 + in(i+5,j-4) * (-0.001488095238095238) &
-                 + in(i+6,j-4) * (-0.001488095238095238) &
-                 + in(i-3,j-3) * (-0.013888888888888888) &
-                 + in(i+1,j-3) * (-0.002777777777777778) &
-                 + in(i+2,j-3) * (-0.002777777777777778) &
-                 + in(i+3,j-3) * (-0.002777777777777778) &
-                 + in(i+4,j-3) * (-0.002777777777777778) &
-                 + in(i+5,j-3) * (-0.002777777777777778) &
-                 + in(i+6,j-3) * (-0.002777777777777778) &
-                 + in(i-2,j-2) * (-0.020833333333333332) &
-                 + in(i+1,j-2) * (-0.006944444444444444) &
-                 + in(i+2,j-2) * (-0.006944444444444444) &
-                 + in(i+3,j-2) * (-0.006944444444444444) &
-                 + in(i+4,j-2) * (-0.006944444444444444) &
-                 + in(i+5,j-2) * (-0.006944444444444444) &
-                 + in(i+6,j-2) * (-0.006944444444444444) &
-                 + in(i-1,j-1) * (-0.041666666666666664) &
-                 + in(i+1,j-1) * (-0.041666666666666664) &
-                 + in(i+2,j-1) * (-0.041666666666666664) &
-                 + in(i+3,j-1) * (-0.041666666666666664) &
-                 + in(i+4,j-1) * (-0.041666666666666664) &
-                 + in(i+5,j-1) * (-0.041666666666666664) &
-                 + in(i+6,j-1) * (-0.041666666666666664) &
-                 + in(i-6,j+1) * (-0.0006313131313131314) &
-                 + in(i-5,j+1) * (-0.000925925925925926) &
-                 + in(i-4,j+1) * (-0.001488095238095238) &
-                 + in(i-3,j+1) * (-0.002777777777777778) &
-                 + in(i-2,j+1) * (-0.006944444444444444) &
-                 + in(i-1,j+1) * (-0.041666666666666664) &
-                 + in(i+1,j+1) * (0.041666666666666664) &
-                 + in(i+2,j+1) * (0.006944444444444444) &
-                 + in(i+3,j+1) * (0.002777777777777778) &
-                 + in(i+4,j+1) * (0.001488095238095238) &
-                 + in(i+5,j+1) * (0.000925925925925926) &
-                 + in(i+6,j+1) * (0.0006313131313131314) &
-                 + in(i-6,j+2) * (-0.0006313131313131314) &
-                 + in(i-5,j+2) * (-0.000925925925925926) &
-                 + in(i-4,j+2) * (-0.001488095238095238) &
-                 + in(i-3,j+2) * (-0.002777777777777778) &
-                 + in(i-2,j+2) * (-0.006944444444444444) &
-                 + in(i-1,j+2) * (-0.041666666666666664) &
-                 + in(i+1,j+2) * (0.006944444444444444) &
-                 + in(i+2,j+2) * (0.020833333333333332) &
-                 + in(i+3,j+2) * (0.002777777777777778) &
-                 + in(i+4,j+2) * (0.001488095238095238) &
-                 + in(i+5,j+2) * (0.000925925925925926) &
-                 + in(i+6,j+2) * (0.0006313131313131314) &
-                 + in(i-6,j+3) * (-0.0006313131313131314) &
-                 + in(i-5,j+3) * (-0.000925925925925926) &
-                 + in(i-4,j+3) * (-0.001488095238095238) &
-                 + in(i-3,j+3) * (-0.002777777777777778) &
-                 + in(i-2,j+3) * (-0.006944444444444444) &
-                 + in(i-1,j+3) * (-0.041666666666666664) &
-                 + in(i+1,j+3) * (0.002777777777777778) &
-                 + in(i+2,j+3) * (0.002777777777777778) &
-                 + in(i+3,j+3) * (0.013888888888888888) &
-                 + in(i+4,j+3) * (0.001488095238095238) &
-                 + in(i+5,j+3) * (0.000925925925925926) &
-                 + in(i+6,j+3) * (0.0006313131313131314) &
-                 + in(i-6,j+4) * (-0.0006313131313131314) &
-                 + in(i-5,j+4) * (-0.000925925925925926) &
-                 + in(i-4,j+4) * (-0.001488095238095238) &
-                 + in(i-3,j+4) * (-0.002777777777777778) &
-                 + in(i-2,j+4) * (-0.006944444444444444) &
-                 + in(i-1,j+4) * (-0.041666666666666664) &
-                 + in(i+1,j+4) * (0.001488095238095238) &
-                 + in(i+2,j+4) * (0.001488095238095238) &
-                 + in(i+3,j+4) * (0.001488095238095238) &
-                 + in(i+4,j+4) * (0.010416666666666666) &
-                 + in(i+5,j+4) * (0.000925925925925926) &
-                 + in(i+6,j+4) * (0.0006313131313131314) &
-                 + in(i-6,j+5) * (-0.0006313131313131314) &
-                 + in(i-5,j+5) * (-0.000925925925925926) &
-                 + in(i-4,j+5) * (-0.001488095238095238) &
-                 + in(i-3,j+5) * (-0.002777777777777778) &
-                 + in(i-2,j+5) * (-0.006944444444444444) &
-                 + in(i-1,j+5) * (-0.041666666666666664) &
-                 + in(i+1,j+5) * (0.000925925925925926) &
-                 + in(i+2,j+5) * (0.000925925925925926) &
-                 + in(i+3,j+5) * (0.000925925925925926) &
-                 + in(i+4,j+5) * (0.000925925925925926) &
-                 + in(i+5,j+5) * (0.008333333333333333) &
-                 + in(i+6,j+5) * (0.0006313131313131314) &
-                 + in(i-6,j+6) * (-0.0006313131313131314) &
-                 + in(i-5,j+6) * (-0.000925925925925926) &
-                 + in(i-4,j+6) * (-0.001488095238095238) &
-                 + in(i-3,j+6) * (-0.002777777777777778) &
-                 + in(i-2,j+6) * (-0.006944444444444444) &
-                 + in(i-1,j+6) * (-0.041666666666666664) &
-                 + in(i+1,j+6) * (0.0006313131313131314) &
-                 + in(i+2,j+6) * (0.0006313131313131314) &
-                 + in(i+3,j+6) * (0.0006313131313131314) &
-                 + in(i+4,j+6) * (0.0006313131313131314) &
-                 + in(i+5,j+6) * (0.0006313131313131314) &
-                 + in(i+6,j+6) * (0.006944444444444444) &
+                 + in(i-6,j-6) * (-0.006944444444444444d0) &
+                 + in(i+1,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+2,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+3,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+4,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+5,j-6) * (-0.0006313131313131314d0) &
+                 + in(i+6,j-6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j-5) * (-0.008333333333333333d0) &
+                 + in(i+1,j-5) * (-0.000925925925925926d0) &
+                 + in(i+2,j-5) * (-0.000925925925925926d0) &
+                 + in(i+3,j-5) * (-0.000925925925925926d0) &
+                 + in(i+4,j-5) * (-0.000925925925925926d0) &
+                 + in(i+5,j-5) * (-0.000925925925925926d0) &
+                 + in(i+6,j-5) * (-0.000925925925925926d0) &
+                 + in(i-4,j-4) * (-0.010416666666666666d0) &
+                 + in(i+1,j-4) * (-0.001488095238095238d0) &
+                 + in(i+2,j-4) * (-0.001488095238095238d0) &
+                 + in(i+3,j-4) * (-0.001488095238095238d0) &
+                 + in(i+4,j-4) * (-0.001488095238095238d0) &
+                 + in(i+5,j-4) * (-0.001488095238095238d0) &
+                 + in(i+6,j-4) * (-0.001488095238095238d0) &
+                 + in(i-3,j-3) * (-0.013888888888888888d0) &
+                 + in(i+1,j-3) * (-0.002777777777777778d0) &
+                 + in(i+2,j-3) * (-0.002777777777777778d0) &
+                 + in(i+3,j-3) * (-0.002777777777777778d0) &
+                 + in(i+4,j-3) * (-0.002777777777777778d0) &
+                 + in(i+5,j-3) * (-0.002777777777777778d0) &
+                 + in(i+6,j-3) * (-0.002777777777777778d0) &
+                 + in(i-2,j-2) * (-0.020833333333333332d0) &
+                 + in(i+1,j-2) * (-0.006944444444444444d0) &
+                 + in(i+2,j-2) * (-0.006944444444444444d0) &
+                 + in(i+3,j-2) * (-0.006944444444444444d0) &
+                 + in(i+4,j-2) * (-0.006944444444444444d0) &
+                 + in(i+5,j-2) * (-0.006944444444444444d0) &
+                 + in(i+6,j-2) * (-0.006944444444444444d0) &
+                 + in(i-1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+1,j-1) * (-0.041666666666666664d0) &
+                 + in(i+2,j-1) * (-0.041666666666666664d0) &
+                 + in(i+3,j-1) * (-0.041666666666666664d0) &
+                 + in(i+4,j-1) * (-0.041666666666666664d0) &
+                 + in(i+5,j-1) * (-0.041666666666666664d0) &
+                 + in(i+6,j-1) * (-0.041666666666666664d0) &
+                 + in(i-6,j+1) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+1) * (-0.000925925925925926d0) &
+                 + in(i-4,j+1) * (-0.001488095238095238d0) &
+                 + in(i-3,j+1) * (-0.002777777777777778d0) &
+                 + in(i-2,j+1) * (-0.006944444444444444d0) &
+                 + in(i-1,j+1) * (-0.041666666666666664d0) &
+                 + in(i+1,j+1) * (0.041666666666666664d0) &
+                 + in(i+2,j+1) * (0.006944444444444444d0) &
+                 + in(i+3,j+1) * (0.002777777777777778d0) &
+                 + in(i+4,j+1) * (0.001488095238095238d0) &
+                 + in(i+5,j+1) * (0.000925925925925926d0) &
+                 + in(i+6,j+1) * (0.0006313131313131314d0) &
+                 + in(i-6,j+2) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+2) * (-0.000925925925925926d0) &
+                 + in(i-4,j+2) * (-0.001488095238095238d0) &
+                 + in(i-3,j+2) * (-0.002777777777777778d0) &
+                 + in(i-2,j+2) * (-0.006944444444444444d0) &
+                 + in(i-1,j+2) * (-0.041666666666666664d0) &
+                 + in(i+1,j+2) * (0.006944444444444444d0) &
+                 + in(i+2,j+2) * (0.020833333333333332d0) &
+                 + in(i+3,j+2) * (0.002777777777777778d0) &
+                 + in(i+4,j+2) * (0.001488095238095238d0) &
+                 + in(i+5,j+2) * (0.000925925925925926d0) &
+                 + in(i+6,j+2) * (0.0006313131313131314d0) &
+                 + in(i-6,j+3) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+3) * (-0.000925925925925926d0) &
+                 + in(i-4,j+3) * (-0.001488095238095238d0) &
+                 + in(i-3,j+3) * (-0.002777777777777778d0) &
+                 + in(i-2,j+3) * (-0.006944444444444444d0) &
+                 + in(i-1,j+3) * (-0.041666666666666664d0) &
+                 + in(i+1,j+3) * (0.002777777777777778d0) &
+                 + in(i+2,j+3) * (0.002777777777777778d0) &
+                 + in(i+3,j+3) * (0.013888888888888888d0) &
+                 + in(i+4,j+3) * (0.001488095238095238d0) &
+                 + in(i+5,j+3) * (0.000925925925925926d0) &
+                 + in(i+6,j+3) * (0.0006313131313131314d0) &
+                 + in(i-6,j+4) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+4) * (-0.000925925925925926d0) &
+                 + in(i-4,j+4) * (-0.001488095238095238d0) &
+                 + in(i-3,j+4) * (-0.002777777777777778d0) &
+                 + in(i-2,j+4) * (-0.006944444444444444d0) &
+                 + in(i-1,j+4) * (-0.041666666666666664d0) &
+                 + in(i+1,j+4) * (0.001488095238095238d0) &
+                 + in(i+2,j+4) * (0.001488095238095238d0) &
+                 + in(i+3,j+4) * (0.001488095238095238d0) &
+                 + in(i+4,j+4) * (0.010416666666666666d0) &
+                 + in(i+5,j+4) * (0.000925925925925926d0) &
+                 + in(i+6,j+4) * (0.0006313131313131314d0) &
+                 + in(i-6,j+5) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+5) * (-0.000925925925925926d0) &
+                 + in(i-4,j+5) * (-0.001488095238095238d0) &
+                 + in(i-3,j+5) * (-0.002777777777777778d0) &
+                 + in(i-2,j+5) * (-0.006944444444444444d0) &
+                 + in(i-1,j+5) * (-0.041666666666666664d0) &
+                 + in(i+1,j+5) * (0.000925925925925926d0) &
+                 + in(i+2,j+5) * (0.000925925925925926d0) &
+                 + in(i+3,j+5) * (0.000925925925925926d0) &
+                 + in(i+4,j+5) * (0.000925925925925926d0) &
+                 + in(i+5,j+5) * (0.008333333333333333d0) &
+                 + in(i+6,j+5) * (0.0006313131313131314d0) &
+                 + in(i-6,j+6) * (-0.0006313131313131314d0) &
+                 + in(i-5,j+6) * (-0.000925925925925926d0) &
+                 + in(i-4,j+6) * (-0.001488095238095238d0) &
+                 + in(i-3,j+6) * (-0.002777777777777778d0) &
+                 + in(i-2,j+6) * (-0.006944444444444444d0) &
+                 + in(i-1,j+6) * (-0.041666666666666664d0) &
+                 + in(i+1,j+6) * (0.0006313131313131314d0) &
+                 + in(i+2,j+6) * (0.0006313131313131314d0) &
+                 + in(i+3,j+6) * (0.0006313131313131314d0) &
+                 + in(i+4,j+6) * (0.0006313131313131314d0) &
+                 + in(i+5,j+6) * (0.0006313131313131314d0) &
+                 + in(i+6,j+6) * (0.006944444444444444d0) &
 +0.0
       end do
       !$omp end simd
@@ -769,160 +769,160 @@ subroutine grid7(n, in, out)
       !$omp simd
       do j=7,n-7-1
         out(i,j) = out(i,j) &
-                 + in(i-7,j-7) * (-0.00510204081632653) &
-                 + in(i+1,j-7) * (-0.0003924646781789639) &
-                 + in(i+2,j-7) * (-0.0003924646781789639) &
-                 + in(i+3,j-7) * (-0.0003924646781789639) &
-                 + in(i+4,j-7) * (-0.0003924646781789639) &
-                 + in(i+5,j-7) * (-0.0003924646781789639) &
-                 + in(i+6,j-7) * (-0.0003924646781789639) &
-                 + in(i+7,j-7) * (-0.0003924646781789639) &
-                 + in(i-6,j-6) * (-0.005952380952380952) &
-                 + in(i+1,j-6) * (-0.0005411255411255411) &
-                 + in(i+2,j-6) * (-0.0005411255411255411) &
-                 + in(i+3,j-6) * (-0.0005411255411255411) &
-                 + in(i+4,j-6) * (-0.0005411255411255411) &
-                 + in(i+5,j-6) * (-0.0005411255411255411) &
-                 + in(i+6,j-6) * (-0.0005411255411255411) &
-                 + in(i+7,j-6) * (-0.0005411255411255411) &
-                 + in(i-5,j-5) * (-0.007142857142857143) &
-                 + in(i+1,j-5) * (-0.0007936507936507937) &
-                 + in(i+2,j-5) * (-0.0007936507936507937) &
-                 + in(i+3,j-5) * (-0.0007936507936507937) &
-                 + in(i+4,j-5) * (-0.0007936507936507937) &
-                 + in(i+5,j-5) * (-0.0007936507936507937) &
-                 + in(i+6,j-5) * (-0.0007936507936507937) &
-                 + in(i+7,j-5) * (-0.0007936507936507937) &
-                 + in(i-4,j-4) * (-0.008928571428571428) &
-                 + in(i+1,j-4) * (-0.0012755102040816326) &
-                 + in(i+2,j-4) * (-0.0012755102040816326) &
-                 + in(i+3,j-4) * (-0.0012755102040816326) &
-                 + in(i+4,j-4) * (-0.0012755102040816326) &
-                 + in(i+5,j-4) * (-0.0012755102040816326) &
-                 + in(i+6,j-4) * (-0.0012755102040816326) &
-                 + in(i+7,j-4) * (-0.0012755102040816326) &
-                 + in(i-3,j-3) * (-0.011904761904761904) &
-                 + in(i+1,j-3) * (-0.002380952380952381) &
-                 + in(i+2,j-3) * (-0.002380952380952381) &
-                 + in(i+3,j-3) * (-0.002380952380952381) &
-                 + in(i+4,j-3) * (-0.002380952380952381) &
-                 + in(i+5,j-3) * (-0.002380952380952381) &
-                 + in(i+6,j-3) * (-0.002380952380952381) &
-                 + in(i+7,j-3) * (-0.002380952380952381) &
-                 + in(i-2,j-2) * (-0.017857142857142856) &
-                 + in(i+1,j-2) * (-0.005952380952380952) &
-                 + in(i+2,j-2) * (-0.005952380952380952) &
-                 + in(i+3,j-2) * (-0.005952380952380952) &
-                 + in(i+4,j-2) * (-0.005952380952380952) &
-                 + in(i+5,j-2) * (-0.005952380952380952) &
-                 + in(i+6,j-2) * (-0.005952380952380952) &
-                 + in(i+7,j-2) * (-0.005952380952380952) &
-                 + in(i-1,j-1) * (-0.03571428571428571) &
-                 + in(i+1,j-1) * (-0.03571428571428571) &
-                 + in(i+2,j-1) * (-0.03571428571428571) &
-                 + in(i+3,j-1) * (-0.03571428571428571) &
-                 + in(i+4,j-1) * (-0.03571428571428571) &
-                 + in(i+5,j-1) * (-0.03571428571428571) &
-                 + in(i+6,j-1) * (-0.03571428571428571) &
-                 + in(i+7,j-1) * (-0.03571428571428571) &
-                 + in(i-7,j+1) * (-0.0003924646781789639) &
-                 + in(i-6,j+1) * (-0.0005411255411255411) &
-                 + in(i-5,j+1) * (-0.0007936507936507937) &
-                 + in(i-4,j+1) * (-0.0012755102040816326) &
-                 + in(i-3,j+1) * (-0.002380952380952381) &
-                 + in(i-2,j+1) * (-0.005952380952380952) &
-                 + in(i-1,j+1) * (-0.03571428571428571) &
-                 + in(i+1,j+1) * (0.03571428571428571) &
-                 + in(i+2,j+1) * (0.005952380952380952) &
-                 + in(i+3,j+1) * (0.002380952380952381) &
-                 + in(i+4,j+1) * (0.0012755102040816326) &
-                 + in(i+5,j+1) * (0.0007936507936507937) &
-                 + in(i+6,j+1) * (0.0005411255411255411) &
-                 + in(i+7,j+1) * (0.0003924646781789639) &
-                 + in(i-7,j+2) * (-0.0003924646781789639) &
-                 + in(i-6,j+2) * (-0.0005411255411255411) &
-                 + in(i-5,j+2) * (-0.0007936507936507937) &
-                 + in(i-4,j+2) * (-0.0012755102040816326) &
-                 + in(i-3,j+2) * (-0.002380952380952381) &
-                 + in(i-2,j+2) * (-0.005952380952380952) &
-                 + in(i-1,j+2) * (-0.03571428571428571) &
-                 + in(i+1,j+2) * (0.005952380952380952) &
-                 + in(i+2,j+2) * (0.017857142857142856) &
-                 + in(i+3,j+2) * (0.002380952380952381) &
-                 + in(i+4,j+2) * (0.0012755102040816326) &
-                 + in(i+5,j+2) * (0.0007936507936507937) &
-                 + in(i+6,j+2) * (0.0005411255411255411) &
-                 + in(i+7,j+2) * (0.0003924646781789639) &
-                 + in(i-7,j+3) * (-0.0003924646781789639) &
-                 + in(i-6,j+3) * (-0.0005411255411255411) &
-                 + in(i-5,j+3) * (-0.0007936507936507937) &
-                 + in(i-4,j+3) * (-0.0012755102040816326) &
-                 + in(i-3,j+3) * (-0.002380952380952381) &
-                 + in(i-2,j+3) * (-0.005952380952380952) &
-                 + in(i-1,j+3) * (-0.03571428571428571) &
-                 + in(i+1,j+3) * (0.002380952380952381) &
-                 + in(i+2,j+3) * (0.002380952380952381) &
-                 + in(i+3,j+3) * (0.011904761904761904) &
-                 + in(i+4,j+3) * (0.0012755102040816326) &
-                 + in(i+5,j+3) * (0.0007936507936507937) &
-                 + in(i+6,j+3) * (0.0005411255411255411) &
-                 + in(i+7,j+3) * (0.0003924646781789639) &
-                 + in(i-7,j+4) * (-0.0003924646781789639) &
-                 + in(i-6,j+4) * (-0.0005411255411255411) &
-                 + in(i-5,j+4) * (-0.0007936507936507937) &
-                 + in(i-4,j+4) * (-0.0012755102040816326) &
-                 + in(i-3,j+4) * (-0.002380952380952381) &
-                 + in(i-2,j+4) * (-0.005952380952380952) &
-                 + in(i-1,j+4) * (-0.03571428571428571) &
-                 + in(i+1,j+4) * (0.0012755102040816326) &
-                 + in(i+2,j+4) * (0.0012755102040816326) &
-                 + in(i+3,j+4) * (0.0012755102040816326) &
-                 + in(i+4,j+4) * (0.008928571428571428) &
-                 + in(i+5,j+4) * (0.0007936507936507937) &
-                 + in(i+6,j+4) * (0.0005411255411255411) &
-                 + in(i+7,j+4) * (0.0003924646781789639) &
-                 + in(i-7,j+5) * (-0.0003924646781789639) &
-                 + in(i-6,j+5) * (-0.0005411255411255411) &
-                 + in(i-5,j+5) * (-0.0007936507936507937) &
-                 + in(i-4,j+5) * (-0.0012755102040816326) &
-                 + in(i-3,j+5) * (-0.002380952380952381) &
-                 + in(i-2,j+5) * (-0.005952380952380952) &
-                 + in(i-1,j+5) * (-0.03571428571428571) &
-                 + in(i+1,j+5) * (0.0007936507936507937) &
-                 + in(i+2,j+5) * (0.0007936507936507937) &
-                 + in(i+3,j+5) * (0.0007936507936507937) &
-                 + in(i+4,j+5) * (0.0007936507936507937) &
-                 + in(i+5,j+5) * (0.007142857142857143) &
-                 + in(i+6,j+5) * (0.0005411255411255411) &
-                 + in(i+7,j+5) * (0.0003924646781789639) &
-                 + in(i-7,j+6) * (-0.0003924646781789639) &
-                 + in(i-6,j+6) * (-0.0005411255411255411) &
-                 + in(i-5,j+6) * (-0.0007936507936507937) &
-                 + in(i-4,j+6) * (-0.0012755102040816326) &
-                 + in(i-3,j+6) * (-0.002380952380952381) &
-                 + in(i-2,j+6) * (-0.005952380952380952) &
-                 + in(i-1,j+6) * (-0.03571428571428571) &
-                 + in(i+1,j+6) * (0.0005411255411255411) &
-                 + in(i+2,j+6) * (0.0005411255411255411) &
-                 + in(i+3,j+6) * (0.0005411255411255411) &
-                 + in(i+4,j+6) * (0.0005411255411255411) &
-                 + in(i+5,j+6) * (0.0005411255411255411) &
-                 + in(i+6,j+6) * (0.005952380952380952) &
-                 + in(i+7,j+6) * (0.0003924646781789639) &
-                 + in(i-7,j+7) * (-0.0003924646781789639) &
-                 + in(i-6,j+7) * (-0.0005411255411255411) &
-                 + in(i-5,j+7) * (-0.0007936507936507937) &
-                 + in(i-4,j+7) * (-0.0012755102040816326) &
-                 + in(i-3,j+7) * (-0.002380952380952381) &
-                 + in(i-2,j+7) * (-0.005952380952380952) &
-                 + in(i-1,j+7) * (-0.03571428571428571) &
-                 + in(i+1,j+7) * (0.0003924646781789639) &
-                 + in(i+2,j+7) * (0.0003924646781789639) &
-                 + in(i+3,j+7) * (0.0003924646781789639) &
-                 + in(i+4,j+7) * (0.0003924646781789639) &
-                 + in(i+5,j+7) * (0.0003924646781789639) &
-                 + in(i+6,j+7) * (0.0003924646781789639) &
-                 + in(i+7,j+7) * (0.00510204081632653) &
+                 + in(i-7,j-7) * (-0.00510204081632653d0) &
+                 + in(i+1,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+2,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+3,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+4,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+5,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+6,j-7) * (-0.0003924646781789639d0) &
+                 + in(i+7,j-7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j-6) * (-0.005952380952380952d0) &
+                 + in(i+1,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+2,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+3,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+4,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+5,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+6,j-6) * (-0.0005411255411255411d0) &
+                 + in(i+7,j-6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j-5) * (-0.007142857142857143d0) &
+                 + in(i+1,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+2,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+3,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+4,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+5,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+6,j-5) * (-0.0007936507936507937d0) &
+                 + in(i+7,j-5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j-4) * (-0.008928571428571428d0) &
+                 + in(i+1,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+2,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+3,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+4,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+5,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+6,j-4) * (-0.0012755102040816326d0) &
+                 + in(i+7,j-4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j-3) * (-0.011904761904761904d0) &
+                 + in(i+1,j-3) * (-0.002380952380952381d0) &
+                 + in(i+2,j-3) * (-0.002380952380952381d0) &
+                 + in(i+3,j-3) * (-0.002380952380952381d0) &
+                 + in(i+4,j-3) * (-0.002380952380952381d0) &
+                 + in(i+5,j-3) * (-0.002380952380952381d0) &
+                 + in(i+6,j-3) * (-0.002380952380952381d0) &
+                 + in(i+7,j-3) * (-0.002380952380952381d0) &
+                 + in(i-2,j-2) * (-0.017857142857142856d0) &
+                 + in(i+1,j-2) * (-0.005952380952380952d0) &
+                 + in(i+2,j-2) * (-0.005952380952380952d0) &
+                 + in(i+3,j-2) * (-0.005952380952380952d0) &
+                 + in(i+4,j-2) * (-0.005952380952380952d0) &
+                 + in(i+5,j-2) * (-0.005952380952380952d0) &
+                 + in(i+6,j-2) * (-0.005952380952380952d0) &
+                 + in(i+7,j-2) * (-0.005952380952380952d0) &
+                 + in(i-1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+1,j-1) * (-0.03571428571428571d0) &
+                 + in(i+2,j-1) * (-0.03571428571428571d0) &
+                 + in(i+3,j-1) * (-0.03571428571428571d0) &
+                 + in(i+4,j-1) * (-0.03571428571428571d0) &
+                 + in(i+5,j-1) * (-0.03571428571428571d0) &
+                 + in(i+6,j-1) * (-0.03571428571428571d0) &
+                 + in(i+7,j-1) * (-0.03571428571428571d0) &
+                 + in(i-7,j+1) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+1) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+1) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+1) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+1) * (-0.002380952380952381d0) &
+                 + in(i-2,j+1) * (-0.005952380952380952d0) &
+                 + in(i-1,j+1) * (-0.03571428571428571d0) &
+                 + in(i+1,j+1) * (0.03571428571428571d0) &
+                 + in(i+2,j+1) * (0.005952380952380952d0) &
+                 + in(i+3,j+1) * (0.002380952380952381d0) &
+                 + in(i+4,j+1) * (0.0012755102040816326d0) &
+                 + in(i+5,j+1) * (0.0007936507936507937d0) &
+                 + in(i+6,j+1) * (0.0005411255411255411d0) &
+                 + in(i+7,j+1) * (0.0003924646781789639d0) &
+                 + in(i-7,j+2) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+2) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+2) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+2) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+2) * (-0.002380952380952381d0) &
+                 + in(i-2,j+2) * (-0.005952380952380952d0) &
+                 + in(i-1,j+2) * (-0.03571428571428571d0) &
+                 + in(i+1,j+2) * (0.005952380952380952d0) &
+                 + in(i+2,j+2) * (0.017857142857142856d0) &
+                 + in(i+3,j+2) * (0.002380952380952381d0) &
+                 + in(i+4,j+2) * (0.0012755102040816326d0) &
+                 + in(i+5,j+2) * (0.0007936507936507937d0) &
+                 + in(i+6,j+2) * (0.0005411255411255411d0) &
+                 + in(i+7,j+2) * (0.0003924646781789639d0) &
+                 + in(i-7,j+3) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+3) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+3) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+3) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+3) * (-0.002380952380952381d0) &
+                 + in(i-2,j+3) * (-0.005952380952380952d0) &
+                 + in(i-1,j+3) * (-0.03571428571428571d0) &
+                 + in(i+1,j+3) * (0.002380952380952381d0) &
+                 + in(i+2,j+3) * (0.002380952380952381d0) &
+                 + in(i+3,j+3) * (0.011904761904761904d0) &
+                 + in(i+4,j+3) * (0.0012755102040816326d0) &
+                 + in(i+5,j+3) * (0.0007936507936507937d0) &
+                 + in(i+6,j+3) * (0.0005411255411255411d0) &
+                 + in(i+7,j+3) * (0.0003924646781789639d0) &
+                 + in(i-7,j+4) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+4) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+4) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+4) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+4) * (-0.002380952380952381d0) &
+                 + in(i-2,j+4) * (-0.005952380952380952d0) &
+                 + in(i-1,j+4) * (-0.03571428571428571d0) &
+                 + in(i+1,j+4) * (0.0012755102040816326d0) &
+                 + in(i+2,j+4) * (0.0012755102040816326d0) &
+                 + in(i+3,j+4) * (0.0012755102040816326d0) &
+                 + in(i+4,j+4) * (0.008928571428571428d0) &
+                 + in(i+5,j+4) * (0.0007936507936507937d0) &
+                 + in(i+6,j+4) * (0.0005411255411255411d0) &
+                 + in(i+7,j+4) * (0.0003924646781789639d0) &
+                 + in(i-7,j+5) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+5) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+5) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+5) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+5) * (-0.002380952380952381d0) &
+                 + in(i-2,j+5) * (-0.005952380952380952d0) &
+                 + in(i-1,j+5) * (-0.03571428571428571d0) &
+                 + in(i+1,j+5) * (0.0007936507936507937d0) &
+                 + in(i+2,j+5) * (0.0007936507936507937d0) &
+                 + in(i+3,j+5) * (0.0007936507936507937d0) &
+                 + in(i+4,j+5) * (0.0007936507936507937d0) &
+                 + in(i+5,j+5) * (0.007142857142857143d0) &
+                 + in(i+6,j+5) * (0.0005411255411255411d0) &
+                 + in(i+7,j+5) * (0.0003924646781789639d0) &
+                 + in(i-7,j+6) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+6) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+6) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+6) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+6) * (-0.002380952380952381d0) &
+                 + in(i-2,j+6) * (-0.005952380952380952d0) &
+                 + in(i-1,j+6) * (-0.03571428571428571d0) &
+                 + in(i+1,j+6) * (0.0005411255411255411d0) &
+                 + in(i+2,j+6) * (0.0005411255411255411d0) &
+                 + in(i+3,j+6) * (0.0005411255411255411d0) &
+                 + in(i+4,j+6) * (0.0005411255411255411d0) &
+                 + in(i+5,j+6) * (0.0005411255411255411d0) &
+                 + in(i+6,j+6) * (0.005952380952380952d0) &
+                 + in(i+7,j+6) * (0.0003924646781789639d0) &
+                 + in(i-7,j+7) * (-0.0003924646781789639d0) &
+                 + in(i-6,j+7) * (-0.0005411255411255411d0) &
+                 + in(i-5,j+7) * (-0.0007936507936507937d0) &
+                 + in(i-4,j+7) * (-0.0012755102040816326d0) &
+                 + in(i-3,j+7) * (-0.002380952380952381d0) &
+                 + in(i-2,j+7) * (-0.005952380952380952d0) &
+                 + in(i-1,j+7) * (-0.03571428571428571d0) &
+                 + in(i+1,j+7) * (0.0003924646781789639d0) &
+                 + in(i+2,j+7) * (0.0003924646781789639d0) &
+                 + in(i+3,j+7) * (0.0003924646781789639d0) &
+                 + in(i+4,j+7) * (0.0003924646781789639d0) &
+                 + in(i+5,j+7) * (0.0003924646781789639d0) &
+                 + in(i+6,j+7) * (0.0003924646781789639d0) &
+                 + in(i+7,j+7) * (0.00510204081632653d0) &
 +0.0
       end do
       !$omp end simd
@@ -942,206 +942,206 @@ subroutine grid8(n, in, out)
       !$omp simd
       do j=8,n-8-1
         out(i,j) = out(i,j) &
-                 + in(i-8,j-8) * (-0.00390625) &
-                 + in(i+1,j-8) * (-0.00026041666666666666) &
-                 + in(i+2,j-8) * (-0.00026041666666666666) &
-                 + in(i+3,j-8) * (-0.00026041666666666666) &
-                 + in(i+4,j-8) * (-0.00026041666666666666) &
-                 + in(i+5,j-8) * (-0.00026041666666666666) &
-                 + in(i+6,j-8) * (-0.00026041666666666666) &
-                 + in(i+7,j-8) * (-0.00026041666666666666) &
-                 + in(i+8,j-8) * (-0.00026041666666666666) &
-                 + in(i-7,j-7) * (-0.004464285714285714) &
-                 + in(i+1,j-7) * (-0.00034340659340659343) &
-                 + in(i+2,j-7) * (-0.00034340659340659343) &
-                 + in(i+3,j-7) * (-0.00034340659340659343) &
-                 + in(i+4,j-7) * (-0.00034340659340659343) &
-                 + in(i+5,j-7) * (-0.00034340659340659343) &
-                 + in(i+6,j-7) * (-0.00034340659340659343) &
-                 + in(i+7,j-7) * (-0.00034340659340659343) &
-                 + in(i+8,j-7) * (-0.00034340659340659343) &
-                 + in(i-6,j-6) * (-0.005208333333333333) &
-                 + in(i+1,j-6) * (-0.0004734848484848485) &
-                 + in(i+2,j-6) * (-0.0004734848484848485) &
-                 + in(i+3,j-6) * (-0.0004734848484848485) &
-                 + in(i+4,j-6) * (-0.0004734848484848485) &
-                 + in(i+5,j-6) * (-0.0004734848484848485) &
-                 + in(i+6,j-6) * (-0.0004734848484848485) &
-                 + in(i+7,j-6) * (-0.0004734848484848485) &
-                 + in(i+8,j-6) * (-0.0004734848484848485) &
-                 + in(i-5,j-5) * (-0.00625) &
-                 + in(i+1,j-5) * (-0.0006944444444444445) &
-                 + in(i+2,j-5) * (-0.0006944444444444445) &
-                 + in(i+3,j-5) * (-0.0006944444444444445) &
-                 + in(i+4,j-5) * (-0.0006944444444444445) &
-                 + in(i+5,j-5) * (-0.0006944444444444445) &
-                 + in(i+6,j-5) * (-0.0006944444444444445) &
-                 + in(i+7,j-5) * (-0.0006944444444444445) &
-                 + in(i+8,j-5) * (-0.0006944444444444445) &
-                 + in(i-4,j-4) * (-0.0078125) &
-                 + in(i+1,j-4) * (-0.0011160714285714285) &
-                 + in(i+2,j-4) * (-0.0011160714285714285) &
-                 + in(i+3,j-4) * (-0.0011160714285714285) &
-                 + in(i+4,j-4) * (-0.0011160714285714285) &
-                 + in(i+5,j-4) * (-0.0011160714285714285) &
-                 + in(i+6,j-4) * (-0.0011160714285714285) &
-                 + in(i+7,j-4) * (-0.0011160714285714285) &
-                 + in(i+8,j-4) * (-0.0011160714285714285) &
-                 + in(i-3,j-3) * (-0.010416666666666666) &
-                 + in(i+1,j-3) * (-0.0020833333333333333) &
-                 + in(i+2,j-3) * (-0.0020833333333333333) &
-                 + in(i+3,j-3) * (-0.0020833333333333333) &
-                 + in(i+4,j-3) * (-0.0020833333333333333) &
-                 + in(i+5,j-3) * (-0.0020833333333333333) &
-                 + in(i+6,j-3) * (-0.0020833333333333333) &
-                 + in(i+7,j-3) * (-0.0020833333333333333) &
-                 + in(i+8,j-3) * (-0.0020833333333333333) &
-                 + in(i-2,j-2) * (-0.015625) &
-                 + in(i+1,j-2) * (-0.005208333333333333) &
-                 + in(i+2,j-2) * (-0.005208333333333333) &
-                 + in(i+3,j-2) * (-0.005208333333333333) &
-                 + in(i+4,j-2) * (-0.005208333333333333) &
-                 + in(i+5,j-2) * (-0.005208333333333333) &
-                 + in(i+6,j-2) * (-0.005208333333333333) &
-                 + in(i+7,j-2) * (-0.005208333333333333) &
-                 + in(i+8,j-2) * (-0.005208333333333333) &
-                 + in(i-1,j-1) * (-0.03125) &
-                 + in(i+1,j-1) * (-0.03125) &
-                 + in(i+2,j-1) * (-0.03125) &
-                 + in(i+3,j-1) * (-0.03125) &
-                 + in(i+4,j-1) * (-0.03125) &
-                 + in(i+5,j-1) * (-0.03125) &
-                 + in(i+6,j-1) * (-0.03125) &
-                 + in(i+7,j-1) * (-0.03125) &
-                 + in(i+8,j-1) * (-0.03125) &
-                 + in(i-8,j+1) * (-0.00026041666666666666) &
-                 + in(i-7,j+1) * (-0.00034340659340659343) &
-                 + in(i-6,j+1) * (-0.0004734848484848485) &
-                 + in(i-5,j+1) * (-0.0006944444444444445) &
-                 + in(i-4,j+1) * (-0.0011160714285714285) &
-                 + in(i-3,j+1) * (-0.0020833333333333333) &
-                 + in(i-2,j+1) * (-0.005208333333333333) &
-                 + in(i-1,j+1) * (-0.03125) &
-                 + in(i+1,j+1) * (0.03125) &
-                 + in(i+2,j+1) * (0.005208333333333333) &
-                 + in(i+3,j+1) * (0.0020833333333333333) &
-                 + in(i+4,j+1) * (0.0011160714285714285) &
-                 + in(i+5,j+1) * (0.0006944444444444445) &
-                 + in(i+6,j+1) * (0.0004734848484848485) &
-                 + in(i+7,j+1) * (0.00034340659340659343) &
-                 + in(i+8,j+1) * (0.00026041666666666666) &
-                 + in(i-8,j+2) * (-0.00026041666666666666) &
-                 + in(i-7,j+2) * (-0.00034340659340659343) &
-                 + in(i-6,j+2) * (-0.0004734848484848485) &
-                 + in(i-5,j+2) * (-0.0006944444444444445) &
-                 + in(i-4,j+2) * (-0.0011160714285714285) &
-                 + in(i-3,j+2) * (-0.0020833333333333333) &
-                 + in(i-2,j+2) * (-0.005208333333333333) &
-                 + in(i-1,j+2) * (-0.03125) &
-                 + in(i+1,j+2) * (0.005208333333333333) &
-                 + in(i+2,j+2) * (0.015625) &
-                 + in(i+3,j+2) * (0.0020833333333333333) &
-                 + in(i+4,j+2) * (0.0011160714285714285) &
-                 + in(i+5,j+2) * (0.0006944444444444445) &
-                 + in(i+6,j+2) * (0.0004734848484848485) &
-                 + in(i+7,j+2) * (0.00034340659340659343) &
-                 + in(i+8,j+2) * (0.00026041666666666666) &
-                 + in(i-8,j+3) * (-0.00026041666666666666) &
-                 + in(i-7,j+3) * (-0.00034340659340659343) &
-                 + in(i-6,j+3) * (-0.0004734848484848485) &
-                 + in(i-5,j+3) * (-0.0006944444444444445) &
-                 + in(i-4,j+3) * (-0.0011160714285714285) &
-                 + in(i-3,j+3) * (-0.0020833333333333333) &
-                 + in(i-2,j+3) * (-0.005208333333333333) &
-                 + in(i-1,j+3) * (-0.03125) &
-                 + in(i+1,j+3) * (0.0020833333333333333) &
-                 + in(i+2,j+3) * (0.0020833333333333333) &
-                 + in(i+3,j+3) * (0.010416666666666666) &
-                 + in(i+4,j+3) * (0.0011160714285714285) &
-                 + in(i+5,j+3) * (0.0006944444444444445) &
-                 + in(i+6,j+3) * (0.0004734848484848485) &
-                 + in(i+7,j+3) * (0.00034340659340659343) &
-                 + in(i+8,j+3) * (0.00026041666666666666) &
-                 + in(i-8,j+4) * (-0.00026041666666666666) &
-                 + in(i-7,j+4) * (-0.00034340659340659343) &
-                 + in(i-6,j+4) * (-0.0004734848484848485) &
-                 + in(i-5,j+4) * (-0.0006944444444444445) &
-                 + in(i-4,j+4) * (-0.0011160714285714285) &
-                 + in(i-3,j+4) * (-0.0020833333333333333) &
-                 + in(i-2,j+4) * (-0.005208333333333333) &
-                 + in(i-1,j+4) * (-0.03125) &
-                 + in(i+1,j+4) * (0.0011160714285714285) &
-                 + in(i+2,j+4) * (0.0011160714285714285) &
-                 + in(i+3,j+4) * (0.0011160714285714285) &
-                 + in(i+4,j+4) * (0.0078125) &
-                 + in(i+5,j+4) * (0.0006944444444444445) &
-                 + in(i+6,j+4) * (0.0004734848484848485) &
-                 + in(i+7,j+4) * (0.00034340659340659343) &
-                 + in(i+8,j+4) * (0.00026041666666666666) &
-                 + in(i-8,j+5) * (-0.00026041666666666666) &
-                 + in(i-7,j+5) * (-0.00034340659340659343) &
-                 + in(i-6,j+5) * (-0.0004734848484848485) &
-                 + in(i-5,j+5) * (-0.0006944444444444445) &
-                 + in(i-4,j+5) * (-0.0011160714285714285) &
-                 + in(i-3,j+5) * (-0.0020833333333333333) &
-                 + in(i-2,j+5) * (-0.005208333333333333) &
-                 + in(i-1,j+5) * (-0.03125) &
-                 + in(i+1,j+5) * (0.0006944444444444445) &
-                 + in(i+2,j+5) * (0.0006944444444444445) &
-                 + in(i+3,j+5) * (0.0006944444444444445) &
-                 + in(i+4,j+5) * (0.0006944444444444445) &
-                 + in(i+5,j+5) * (0.00625) &
-                 + in(i+6,j+5) * (0.0004734848484848485) &
-                 + in(i+7,j+5) * (0.00034340659340659343) &
-                 + in(i+8,j+5) * (0.00026041666666666666) &
-                 + in(i-8,j+6) * (-0.00026041666666666666) &
-                 + in(i-7,j+6) * (-0.00034340659340659343) &
-                 + in(i-6,j+6) * (-0.0004734848484848485) &
-                 + in(i-5,j+6) * (-0.0006944444444444445) &
-                 + in(i-4,j+6) * (-0.0011160714285714285) &
-                 + in(i-3,j+6) * (-0.0020833333333333333) &
-                 + in(i-2,j+6) * (-0.005208333333333333) &
-                 + in(i-1,j+6) * (-0.03125) &
-                 + in(i+1,j+6) * (0.0004734848484848485) &
-                 + in(i+2,j+6) * (0.0004734848484848485) &
-                 + in(i+3,j+6) * (0.0004734848484848485) &
-                 + in(i+4,j+6) * (0.0004734848484848485) &
-                 + in(i+5,j+6) * (0.0004734848484848485) &
-                 + in(i+6,j+6) * (0.005208333333333333) &
-                 + in(i+7,j+6) * (0.00034340659340659343) &
-                 + in(i+8,j+6) * (0.00026041666666666666) &
-                 + in(i-8,j+7) * (-0.00026041666666666666) &
-                 + in(i-7,j+7) * (-0.00034340659340659343) &
-                 + in(i-6,j+7) * (-0.0004734848484848485) &
-                 + in(i-5,j+7) * (-0.0006944444444444445) &
-                 + in(i-4,j+7) * (-0.0011160714285714285) &
-                 + in(i-3,j+7) * (-0.0020833333333333333) &
-                 + in(i-2,j+7) * (-0.005208333333333333) &
-                 + in(i-1,j+7) * (-0.03125) &
-                 + in(i+1,j+7) * (0.00034340659340659343) &
-                 + in(i+2,j+7) * (0.00034340659340659343) &
-                 + in(i+3,j+7) * (0.00034340659340659343) &
-                 + in(i+4,j+7) * (0.00034340659340659343) &
-                 + in(i+5,j+7) * (0.00034340659340659343) &
-                 + in(i+6,j+7) * (0.00034340659340659343) &
-                 + in(i+7,j+7) * (0.004464285714285714) &
-                 + in(i+8,j+7) * (0.00026041666666666666) &
-                 + in(i-8,j+8) * (-0.00026041666666666666) &
-                 + in(i-7,j+8) * (-0.00034340659340659343) &
-                 + in(i-6,j+8) * (-0.0004734848484848485) &
-                 + in(i-5,j+8) * (-0.0006944444444444445) &
-                 + in(i-4,j+8) * (-0.0011160714285714285) &
-                 + in(i-3,j+8) * (-0.0020833333333333333) &
-                 + in(i-2,j+8) * (-0.005208333333333333) &
-                 + in(i-1,j+8) * (-0.03125) &
-                 + in(i+1,j+8) * (0.00026041666666666666) &
-                 + in(i+2,j+8) * (0.00026041666666666666) &
-                 + in(i+3,j+8) * (0.00026041666666666666) &
-                 + in(i+4,j+8) * (0.00026041666666666666) &
-                 + in(i+5,j+8) * (0.00026041666666666666) &
-                 + in(i+6,j+8) * (0.00026041666666666666) &
-                 + in(i+7,j+8) * (0.00026041666666666666) &
-                 + in(i+8,j+8) * (0.00390625) &
+                 + in(i-8,j-8) * (-0.00390625d0) &
+                 + in(i+1,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+2,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+3,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+4,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+5,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+6,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+7,j-8) * (-0.00026041666666666666d0) &
+                 + in(i+8,j-8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j-7) * (-0.004464285714285714d0) &
+                 + in(i+1,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+2,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+3,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+4,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+5,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+6,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+7,j-7) * (-0.00034340659340659343d0) &
+                 + in(i+8,j-7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j-6) * (-0.005208333333333333d0) &
+                 + in(i+1,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+2,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+3,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+4,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+5,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+6,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+7,j-6) * (-0.0004734848484848485d0) &
+                 + in(i+8,j-6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j-5) * (-0.00625d0) &
+                 + in(i+1,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+2,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+3,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+4,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+5,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+6,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+7,j-5) * (-0.0006944444444444445d0) &
+                 + in(i+8,j-5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j-4) * (-0.0078125d0) &
+                 + in(i+1,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+2,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+3,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+4,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+5,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+6,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+7,j-4) * (-0.0011160714285714285d0) &
+                 + in(i+8,j-4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j-3) * (-0.010416666666666666d0) &
+                 + in(i+1,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+2,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+3,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+4,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+5,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+6,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+7,j-3) * (-0.0020833333333333333d0) &
+                 + in(i+8,j-3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j-2) * (-0.015625d0) &
+                 + in(i+1,j-2) * (-0.005208333333333333d0) &
+                 + in(i+2,j-2) * (-0.005208333333333333d0) &
+                 + in(i+3,j-2) * (-0.005208333333333333d0) &
+                 + in(i+4,j-2) * (-0.005208333333333333d0) &
+                 + in(i+5,j-2) * (-0.005208333333333333d0) &
+                 + in(i+6,j-2) * (-0.005208333333333333d0) &
+                 + in(i+7,j-2) * (-0.005208333333333333d0) &
+                 + in(i+8,j-2) * (-0.005208333333333333d0) &
+                 + in(i-1,j-1) * (-0.03125d0) &
+                 + in(i+1,j-1) * (-0.03125d0) &
+                 + in(i+2,j-1) * (-0.03125d0) &
+                 + in(i+3,j-1) * (-0.03125d0) &
+                 + in(i+4,j-1) * (-0.03125d0) &
+                 + in(i+5,j-1) * (-0.03125d0) &
+                 + in(i+6,j-1) * (-0.03125d0) &
+                 + in(i+7,j-1) * (-0.03125d0) &
+                 + in(i+8,j-1) * (-0.03125d0) &
+                 + in(i-8,j+1) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+1) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+1) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+1) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+1) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+1) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+1) * (-0.005208333333333333d0) &
+                 + in(i-1,j+1) * (-0.03125d0) &
+                 + in(i+1,j+1) * (0.03125d0) &
+                 + in(i+2,j+1) * (0.005208333333333333d0) &
+                 + in(i+3,j+1) * (0.0020833333333333333d0) &
+                 + in(i+4,j+1) * (0.0011160714285714285d0) &
+                 + in(i+5,j+1) * (0.0006944444444444445d0) &
+                 + in(i+6,j+1) * (0.0004734848484848485d0) &
+                 + in(i+7,j+1) * (0.00034340659340659343d0) &
+                 + in(i+8,j+1) * (0.00026041666666666666d0) &
+                 + in(i-8,j+2) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+2) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+2) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+2) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+2) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+2) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+2) * (-0.005208333333333333d0) &
+                 + in(i-1,j+2) * (-0.03125d0) &
+                 + in(i+1,j+2) * (0.005208333333333333d0) &
+                 + in(i+2,j+2) * (0.015625d0) &
+                 + in(i+3,j+2) * (0.0020833333333333333d0) &
+                 + in(i+4,j+2) * (0.0011160714285714285d0) &
+                 + in(i+5,j+2) * (0.0006944444444444445d0) &
+                 + in(i+6,j+2) * (0.0004734848484848485d0) &
+                 + in(i+7,j+2) * (0.00034340659340659343d0) &
+                 + in(i+8,j+2) * (0.00026041666666666666d0) &
+                 + in(i-8,j+3) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+3) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+3) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+3) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+3) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+3) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+3) * (-0.005208333333333333d0) &
+                 + in(i-1,j+3) * (-0.03125d0) &
+                 + in(i+1,j+3) * (0.0020833333333333333d0) &
+                 + in(i+2,j+3) * (0.0020833333333333333d0) &
+                 + in(i+3,j+3) * (0.010416666666666666d0) &
+                 + in(i+4,j+3) * (0.0011160714285714285d0) &
+                 + in(i+5,j+3) * (0.0006944444444444445d0) &
+                 + in(i+6,j+3) * (0.0004734848484848485d0) &
+                 + in(i+7,j+3) * (0.00034340659340659343d0) &
+                 + in(i+8,j+3) * (0.00026041666666666666d0) &
+                 + in(i-8,j+4) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+4) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+4) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+4) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+4) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+4) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+4) * (-0.005208333333333333d0) &
+                 + in(i-1,j+4) * (-0.03125d0) &
+                 + in(i+1,j+4) * (0.0011160714285714285d0) &
+                 + in(i+2,j+4) * (0.0011160714285714285d0) &
+                 + in(i+3,j+4) * (0.0011160714285714285d0) &
+                 + in(i+4,j+4) * (0.0078125d0) &
+                 + in(i+5,j+4) * (0.0006944444444444445d0) &
+                 + in(i+6,j+4) * (0.0004734848484848485d0) &
+                 + in(i+7,j+4) * (0.00034340659340659343d0) &
+                 + in(i+8,j+4) * (0.00026041666666666666d0) &
+                 + in(i-8,j+5) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+5) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+5) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+5) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+5) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+5) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+5) * (-0.005208333333333333d0) &
+                 + in(i-1,j+5) * (-0.03125d0) &
+                 + in(i+1,j+5) * (0.0006944444444444445d0) &
+                 + in(i+2,j+5) * (0.0006944444444444445d0) &
+                 + in(i+3,j+5) * (0.0006944444444444445d0) &
+                 + in(i+4,j+5) * (0.0006944444444444445d0) &
+                 + in(i+5,j+5) * (0.00625d0) &
+                 + in(i+6,j+5) * (0.0004734848484848485d0) &
+                 + in(i+7,j+5) * (0.00034340659340659343d0) &
+                 + in(i+8,j+5) * (0.00026041666666666666d0) &
+                 + in(i-8,j+6) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+6) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+6) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+6) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+6) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+6) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+6) * (-0.005208333333333333d0) &
+                 + in(i-1,j+6) * (-0.03125d0) &
+                 + in(i+1,j+6) * (0.0004734848484848485d0) &
+                 + in(i+2,j+6) * (0.0004734848484848485d0) &
+                 + in(i+3,j+6) * (0.0004734848484848485d0) &
+                 + in(i+4,j+6) * (0.0004734848484848485d0) &
+                 + in(i+5,j+6) * (0.0004734848484848485d0) &
+                 + in(i+6,j+6) * (0.005208333333333333d0) &
+                 + in(i+7,j+6) * (0.00034340659340659343d0) &
+                 + in(i+8,j+6) * (0.00026041666666666666d0) &
+                 + in(i-8,j+7) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+7) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+7) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+7) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+7) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+7) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+7) * (-0.005208333333333333d0) &
+                 + in(i-1,j+7) * (-0.03125d0) &
+                 + in(i+1,j+7) * (0.00034340659340659343d0) &
+                 + in(i+2,j+7) * (0.00034340659340659343d0) &
+                 + in(i+3,j+7) * (0.00034340659340659343d0) &
+                 + in(i+4,j+7) * (0.00034340659340659343d0) &
+                 + in(i+5,j+7) * (0.00034340659340659343d0) &
+                 + in(i+6,j+7) * (0.00034340659340659343d0) &
+                 + in(i+7,j+7) * (0.004464285714285714d0) &
+                 + in(i+8,j+7) * (0.00026041666666666666d0) &
+                 + in(i-8,j+8) * (-0.00026041666666666666d0) &
+                 + in(i-7,j+8) * (-0.00034340659340659343d0) &
+                 + in(i-6,j+8) * (-0.0004734848484848485d0) &
+                 + in(i-5,j+8) * (-0.0006944444444444445d0) &
+                 + in(i-4,j+8) * (-0.0011160714285714285d0) &
+                 + in(i-3,j+8) * (-0.0020833333333333333d0) &
+                 + in(i-2,j+8) * (-0.005208333333333333d0) &
+                 + in(i-1,j+8) * (-0.03125d0) &
+                 + in(i+1,j+8) * (0.00026041666666666666d0) &
+                 + in(i+2,j+8) * (0.00026041666666666666d0) &
+                 + in(i+3,j+8) * (0.00026041666666666666d0) &
+                 + in(i+4,j+8) * (0.00026041666666666666d0) &
+                 + in(i+5,j+8) * (0.00026041666666666666d0) &
+                 + in(i+6,j+8) * (0.00026041666666666666d0) &
+                 + in(i+7,j+8) * (0.00026041666666666666d0) &
+                 + in(i+8,j+8) * (0.00390625d0) &
 +0.0
       end do
       !$omp end simd
@@ -1161,258 +1161,258 @@ subroutine grid9(n, in, out)
       !$omp simd
       do j=9,n-9-1
         out(i,j) = out(i,j) &
-                 + in(i-9,j-9) * (-0.0030864197530864196) &
-                 + in(i+1,j-9) * (-0.00018155410312273057) &
-                 + in(i+2,j-9) * (-0.00018155410312273057) &
-                 + in(i+3,j-9) * (-0.00018155410312273057) &
-                 + in(i+4,j-9) * (-0.00018155410312273057) &
-                 + in(i+5,j-9) * (-0.00018155410312273057) &
-                 + in(i+6,j-9) * (-0.00018155410312273057) &
-                 + in(i+7,j-9) * (-0.00018155410312273057) &
-                 + in(i+8,j-9) * (-0.00018155410312273057) &
-                 + in(i+9,j-9) * (-0.00018155410312273057) &
-                 + in(i-8,j-8) * (-0.003472222222222222) &
-                 + in(i+1,j-8) * (-0.0002314814814814815) &
-                 + in(i+2,j-8) * (-0.0002314814814814815) &
-                 + in(i+3,j-8) * (-0.0002314814814814815) &
-                 + in(i+4,j-8) * (-0.0002314814814814815) &
-                 + in(i+5,j-8) * (-0.0002314814814814815) &
-                 + in(i+6,j-8) * (-0.0002314814814814815) &
-                 + in(i+7,j-8) * (-0.0002314814814814815) &
-                 + in(i+8,j-8) * (-0.0002314814814814815) &
-                 + in(i+9,j-8) * (-0.0002314814814814815) &
-                 + in(i-7,j-7) * (-0.003968253968253968) &
-                 + in(i+1,j-7) * (-0.00030525030525030525) &
-                 + in(i+2,j-7) * (-0.00030525030525030525) &
-                 + in(i+3,j-7) * (-0.00030525030525030525) &
-                 + in(i+4,j-7) * (-0.00030525030525030525) &
-                 + in(i+5,j-7) * (-0.00030525030525030525) &
-                 + in(i+6,j-7) * (-0.00030525030525030525) &
-                 + in(i+7,j-7) * (-0.00030525030525030525) &
-                 + in(i+8,j-7) * (-0.00030525030525030525) &
-                 + in(i+9,j-7) * (-0.00030525030525030525) &
-                 + in(i-6,j-6) * (-0.004629629629629629) &
-                 + in(i+1,j-6) * (-0.00042087542087542086) &
-                 + in(i+2,j-6) * (-0.00042087542087542086) &
-                 + in(i+3,j-6) * (-0.00042087542087542086) &
-                 + in(i+4,j-6) * (-0.00042087542087542086) &
-                 + in(i+5,j-6) * (-0.00042087542087542086) &
-                 + in(i+6,j-6) * (-0.00042087542087542086) &
-                 + in(i+7,j-6) * (-0.00042087542087542086) &
-                 + in(i+8,j-6) * (-0.00042087542087542086) &
-                 + in(i+9,j-6) * (-0.00042087542087542086) &
-                 + in(i-5,j-5) * (-0.005555555555555556) &
-                 + in(i+1,j-5) * (-0.0006172839506172839) &
-                 + in(i+2,j-5) * (-0.0006172839506172839) &
-                 + in(i+3,j-5) * (-0.0006172839506172839) &
-                 + in(i+4,j-5) * (-0.0006172839506172839) &
-                 + in(i+5,j-5) * (-0.0006172839506172839) &
-                 + in(i+6,j-5) * (-0.0006172839506172839) &
-                 + in(i+7,j-5) * (-0.0006172839506172839) &
-                 + in(i+8,j-5) * (-0.0006172839506172839) &
-                 + in(i+9,j-5) * (-0.0006172839506172839) &
-                 + in(i-4,j-4) * (-0.006944444444444444) &
-                 + in(i+1,j-4) * (-0.000992063492063492) &
-                 + in(i+2,j-4) * (-0.000992063492063492) &
-                 + in(i+3,j-4) * (-0.000992063492063492) &
-                 + in(i+4,j-4) * (-0.000992063492063492) &
-                 + in(i+5,j-4) * (-0.000992063492063492) &
-                 + in(i+6,j-4) * (-0.000992063492063492) &
-                 + in(i+7,j-4) * (-0.000992063492063492) &
-                 + in(i+8,j-4) * (-0.000992063492063492) &
-                 + in(i+9,j-4) * (-0.000992063492063492) &
-                 + in(i-3,j-3) * (-0.009259259259259259) &
-                 + in(i+1,j-3) * (-0.001851851851851852) &
-                 + in(i+2,j-3) * (-0.001851851851851852) &
-                 + in(i+3,j-3) * (-0.001851851851851852) &
-                 + in(i+4,j-3) * (-0.001851851851851852) &
-                 + in(i+5,j-3) * (-0.001851851851851852) &
-                 + in(i+6,j-3) * (-0.001851851851851852) &
-                 + in(i+7,j-3) * (-0.001851851851851852) &
-                 + in(i+8,j-3) * (-0.001851851851851852) &
-                 + in(i+9,j-3) * (-0.001851851851851852) &
-                 + in(i-2,j-2) * (-0.013888888888888888) &
-                 + in(i+1,j-2) * (-0.004629629629629629) &
-                 + in(i+2,j-2) * (-0.004629629629629629) &
-                 + in(i+3,j-2) * (-0.004629629629629629) &
-                 + in(i+4,j-2) * (-0.004629629629629629) &
-                 + in(i+5,j-2) * (-0.004629629629629629) &
-                 + in(i+6,j-2) * (-0.004629629629629629) &
-                 + in(i+7,j-2) * (-0.004629629629629629) &
-                 + in(i+8,j-2) * (-0.004629629629629629) &
-                 + in(i+9,j-2) * (-0.004629629629629629) &
-                 + in(i-1,j-1) * (-0.027777777777777776) &
-                 + in(i+1,j-1) * (-0.027777777777777776) &
-                 + in(i+2,j-1) * (-0.027777777777777776) &
-                 + in(i+3,j-1) * (-0.027777777777777776) &
-                 + in(i+4,j-1) * (-0.027777777777777776) &
-                 + in(i+5,j-1) * (-0.027777777777777776) &
-                 + in(i+6,j-1) * (-0.027777777777777776) &
-                 + in(i+7,j-1) * (-0.027777777777777776) &
-                 + in(i+8,j-1) * (-0.027777777777777776) &
-                 + in(i+9,j-1) * (-0.027777777777777776) &
-                 + in(i-9,j+1) * (-0.00018155410312273057) &
-                 + in(i-8,j+1) * (-0.0002314814814814815) &
-                 + in(i-7,j+1) * (-0.00030525030525030525) &
-                 + in(i-6,j+1) * (-0.00042087542087542086) &
-                 + in(i-5,j+1) * (-0.0006172839506172839) &
-                 + in(i-4,j+1) * (-0.000992063492063492) &
-                 + in(i-3,j+1) * (-0.001851851851851852) &
-                 + in(i-2,j+1) * (-0.004629629629629629) &
-                 + in(i-1,j+1) * (-0.027777777777777776) &
-                 + in(i+1,j+1) * (0.027777777777777776) &
-                 + in(i+2,j+1) * (0.004629629629629629) &
-                 + in(i+3,j+1) * (0.001851851851851852) &
-                 + in(i+4,j+1) * (0.000992063492063492) &
-                 + in(i+5,j+1) * (0.0006172839506172839) &
-                 + in(i+6,j+1) * (0.00042087542087542086) &
-                 + in(i+7,j+1) * (0.00030525030525030525) &
-                 + in(i+8,j+1) * (0.0002314814814814815) &
-                 + in(i+9,j+1) * (0.00018155410312273057) &
-                 + in(i-9,j+2) * (-0.00018155410312273057) &
-                 + in(i-8,j+2) * (-0.0002314814814814815) &
-                 + in(i-7,j+2) * (-0.00030525030525030525) &
-                 + in(i-6,j+2) * (-0.00042087542087542086) &
-                 + in(i-5,j+2) * (-0.0006172839506172839) &
-                 + in(i-4,j+2) * (-0.000992063492063492) &
-                 + in(i-3,j+2) * (-0.001851851851851852) &
-                 + in(i-2,j+2) * (-0.004629629629629629) &
-                 + in(i-1,j+2) * (-0.027777777777777776) &
-                 + in(i+1,j+2) * (0.004629629629629629) &
-                 + in(i+2,j+2) * (0.013888888888888888) &
-                 + in(i+3,j+2) * (0.001851851851851852) &
-                 + in(i+4,j+2) * (0.000992063492063492) &
-                 + in(i+5,j+2) * (0.0006172839506172839) &
-                 + in(i+6,j+2) * (0.00042087542087542086) &
-                 + in(i+7,j+2) * (0.00030525030525030525) &
-                 + in(i+8,j+2) * (0.0002314814814814815) &
-                 + in(i+9,j+2) * (0.00018155410312273057) &
-                 + in(i-9,j+3) * (-0.00018155410312273057) &
-                 + in(i-8,j+3) * (-0.0002314814814814815) &
-                 + in(i-7,j+3) * (-0.00030525030525030525) &
-                 + in(i-6,j+3) * (-0.00042087542087542086) &
-                 + in(i-5,j+3) * (-0.0006172839506172839) &
-                 + in(i-4,j+3) * (-0.000992063492063492) &
-                 + in(i-3,j+3) * (-0.001851851851851852) &
-                 + in(i-2,j+3) * (-0.004629629629629629) &
-                 + in(i-1,j+3) * (-0.027777777777777776) &
-                 + in(i+1,j+3) * (0.001851851851851852) &
-                 + in(i+2,j+3) * (0.001851851851851852) &
-                 + in(i+3,j+3) * (0.009259259259259259) &
-                 + in(i+4,j+3) * (0.000992063492063492) &
-                 + in(i+5,j+3) * (0.0006172839506172839) &
-                 + in(i+6,j+3) * (0.00042087542087542086) &
-                 + in(i+7,j+3) * (0.00030525030525030525) &
-                 + in(i+8,j+3) * (0.0002314814814814815) &
-                 + in(i+9,j+3) * (0.00018155410312273057) &
-                 + in(i-9,j+4) * (-0.00018155410312273057) &
-                 + in(i-8,j+4) * (-0.0002314814814814815) &
-                 + in(i-7,j+4) * (-0.00030525030525030525) &
-                 + in(i-6,j+4) * (-0.00042087542087542086) &
-                 + in(i-5,j+4) * (-0.0006172839506172839) &
-                 + in(i-4,j+4) * (-0.000992063492063492) &
-                 + in(i-3,j+4) * (-0.001851851851851852) &
-                 + in(i-2,j+4) * (-0.004629629629629629) &
-                 + in(i-1,j+4) * (-0.027777777777777776) &
-                 + in(i+1,j+4) * (0.000992063492063492) &
-                 + in(i+2,j+4) * (0.000992063492063492) &
-                 + in(i+3,j+4) * (0.000992063492063492) &
-                 + in(i+4,j+4) * (0.006944444444444444) &
-                 + in(i+5,j+4) * (0.0006172839506172839) &
-                 + in(i+6,j+4) * (0.00042087542087542086) &
-                 + in(i+7,j+4) * (0.00030525030525030525) &
-                 + in(i+8,j+4) * (0.0002314814814814815) &
-                 + in(i+9,j+4) * (0.00018155410312273057) &
-                 + in(i-9,j+5) * (-0.00018155410312273057) &
-                 + in(i-8,j+5) * (-0.0002314814814814815) &
-                 + in(i-7,j+5) * (-0.00030525030525030525) &
-                 + in(i-6,j+5) * (-0.00042087542087542086) &
-                 + in(i-5,j+5) * (-0.0006172839506172839) &
-                 + in(i-4,j+5) * (-0.000992063492063492) &
-                 + in(i-3,j+5) * (-0.001851851851851852) &
-                 + in(i-2,j+5) * (-0.004629629629629629) &
-                 + in(i-1,j+5) * (-0.027777777777777776) &
-                 + in(i+1,j+5) * (0.0006172839506172839) &
-                 + in(i+2,j+5) * (0.0006172839506172839) &
-                 + in(i+3,j+5) * (0.0006172839506172839) &
-                 + in(i+4,j+5) * (0.0006172839506172839) &
-                 + in(i+5,j+5) * (0.005555555555555556) &
-                 + in(i+6,j+5) * (0.00042087542087542086) &
-                 + in(i+7,j+5) * (0.00030525030525030525) &
-                 + in(i+8,j+5) * (0.0002314814814814815) &
-                 + in(i+9,j+5) * (0.00018155410312273057) &
-                 + in(i-9,j+6) * (-0.00018155410312273057) &
-                 + in(i-8,j+6) * (-0.0002314814814814815) &
-                 + in(i-7,j+6) * (-0.00030525030525030525) &
-                 + in(i-6,j+6) * (-0.00042087542087542086) &
-                 + in(i-5,j+6) * (-0.0006172839506172839) &
-                 + in(i-4,j+6) * (-0.000992063492063492) &
-                 + in(i-3,j+6) * (-0.001851851851851852) &
-                 + in(i-2,j+6) * (-0.004629629629629629) &
-                 + in(i-1,j+6) * (-0.027777777777777776) &
-                 + in(i+1,j+6) * (0.00042087542087542086) &
-                 + in(i+2,j+6) * (0.00042087542087542086) &
-                 + in(i+3,j+6) * (0.00042087542087542086) &
-                 + in(i+4,j+6) * (0.00042087542087542086) &
-                 + in(i+5,j+6) * (0.00042087542087542086) &
-                 + in(i+6,j+6) * (0.004629629629629629) &
-                 + in(i+7,j+6) * (0.00030525030525030525) &
-                 + in(i+8,j+6) * (0.0002314814814814815) &
-                 + in(i+9,j+6) * (0.00018155410312273057) &
-                 + in(i-9,j+7) * (-0.00018155410312273057) &
-                 + in(i-8,j+7) * (-0.0002314814814814815) &
-                 + in(i-7,j+7) * (-0.00030525030525030525) &
-                 + in(i-6,j+7) * (-0.00042087542087542086) &
-                 + in(i-5,j+7) * (-0.0006172839506172839) &
-                 + in(i-4,j+7) * (-0.000992063492063492) &
-                 + in(i-3,j+7) * (-0.001851851851851852) &
-                 + in(i-2,j+7) * (-0.004629629629629629) &
-                 + in(i-1,j+7) * (-0.027777777777777776) &
-                 + in(i+1,j+7) * (0.00030525030525030525) &
-                 + in(i+2,j+7) * (0.00030525030525030525) &
-                 + in(i+3,j+7) * (0.00030525030525030525) &
-                 + in(i+4,j+7) * (0.00030525030525030525) &
-                 + in(i+5,j+7) * (0.00030525030525030525) &
-                 + in(i+6,j+7) * (0.00030525030525030525) &
-                 + in(i+7,j+7) * (0.003968253968253968) &
-                 + in(i+8,j+7) * (0.0002314814814814815) &
-                 + in(i+9,j+7) * (0.00018155410312273057) &
-                 + in(i-9,j+8) * (-0.00018155410312273057) &
-                 + in(i-8,j+8) * (-0.0002314814814814815) &
-                 + in(i-7,j+8) * (-0.00030525030525030525) &
-                 + in(i-6,j+8) * (-0.00042087542087542086) &
-                 + in(i-5,j+8) * (-0.0006172839506172839) &
-                 + in(i-4,j+8) * (-0.000992063492063492) &
-                 + in(i-3,j+8) * (-0.001851851851851852) &
-                 + in(i-2,j+8) * (-0.004629629629629629) &
-                 + in(i-1,j+8) * (-0.027777777777777776) &
-                 + in(i+1,j+8) * (0.0002314814814814815) &
-                 + in(i+2,j+8) * (0.0002314814814814815) &
-                 + in(i+3,j+8) * (0.0002314814814814815) &
-                 + in(i+4,j+8) * (0.0002314814814814815) &
-                 + in(i+5,j+8) * (0.0002314814814814815) &
-                 + in(i+6,j+8) * (0.0002314814814814815) &
-                 + in(i+7,j+8) * (0.0002314814814814815) &
-                 + in(i+8,j+8) * (0.003472222222222222) &
-                 + in(i+9,j+8) * (0.00018155410312273057) &
-                 + in(i-9,j+9) * (-0.00018155410312273057) &
-                 + in(i-8,j+9) * (-0.0002314814814814815) &
-                 + in(i-7,j+9) * (-0.00030525030525030525) &
-                 + in(i-6,j+9) * (-0.00042087542087542086) &
-                 + in(i-5,j+9) * (-0.0006172839506172839) &
-                 + in(i-4,j+9) * (-0.000992063492063492) &
-                 + in(i-3,j+9) * (-0.001851851851851852) &
-                 + in(i-2,j+9) * (-0.004629629629629629) &
-                 + in(i-1,j+9) * (-0.027777777777777776) &
-                 + in(i+1,j+9) * (0.00018155410312273057) &
-                 + in(i+2,j+9) * (0.00018155410312273057) &
-                 + in(i+3,j+9) * (0.00018155410312273057) &
-                 + in(i+4,j+9) * (0.00018155410312273057) &
-                 + in(i+5,j+9) * (0.00018155410312273057) &
-                 + in(i+6,j+9) * (0.00018155410312273057) &
-                 + in(i+7,j+9) * (0.00018155410312273057) &
-                 + in(i+8,j+9) * (0.00018155410312273057) &
-                 + in(i+9,j+9) * (0.0030864197530864196) &
+                 + in(i-9,j-9) * (-0.0030864197530864196d0) &
+                 + in(i+1,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+2,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+3,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+4,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+5,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+6,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+7,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+8,j-9) * (-0.00018155410312273057d0) &
+                 + in(i+9,j-9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j-8) * (-0.003472222222222222d0) &
+                 + in(i+1,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+2,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+3,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+4,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+5,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+6,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+7,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+8,j-8) * (-0.0002314814814814815d0) &
+                 + in(i+9,j-8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j-7) * (-0.003968253968253968d0) &
+                 + in(i+1,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+2,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+3,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+4,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+5,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+6,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+7,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+8,j-7) * (-0.00030525030525030525d0) &
+                 + in(i+9,j-7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j-6) * (-0.004629629629629629d0) &
+                 + in(i+1,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+2,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+3,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+4,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+5,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+6,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+7,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+8,j-6) * (-0.00042087542087542086d0) &
+                 + in(i+9,j-6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j-5) * (-0.005555555555555556d0) &
+                 + in(i+1,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+2,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+3,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+4,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+5,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+6,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+7,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+8,j-5) * (-0.0006172839506172839d0) &
+                 + in(i+9,j-5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j-4) * (-0.006944444444444444d0) &
+                 + in(i+1,j-4) * (-0.000992063492063492d0) &
+                 + in(i+2,j-4) * (-0.000992063492063492d0) &
+                 + in(i+3,j-4) * (-0.000992063492063492d0) &
+                 + in(i+4,j-4) * (-0.000992063492063492d0) &
+                 + in(i+5,j-4) * (-0.000992063492063492d0) &
+                 + in(i+6,j-4) * (-0.000992063492063492d0) &
+                 + in(i+7,j-4) * (-0.000992063492063492d0) &
+                 + in(i+8,j-4) * (-0.000992063492063492d0) &
+                 + in(i+9,j-4) * (-0.000992063492063492d0) &
+                 + in(i-3,j-3) * (-0.009259259259259259d0) &
+                 + in(i+1,j-3) * (-0.001851851851851852d0) &
+                 + in(i+2,j-3) * (-0.001851851851851852d0) &
+                 + in(i+3,j-3) * (-0.001851851851851852d0) &
+                 + in(i+4,j-3) * (-0.001851851851851852d0) &
+                 + in(i+5,j-3) * (-0.001851851851851852d0) &
+                 + in(i+6,j-3) * (-0.001851851851851852d0) &
+                 + in(i+7,j-3) * (-0.001851851851851852d0) &
+                 + in(i+8,j-3) * (-0.001851851851851852d0) &
+                 + in(i+9,j-3) * (-0.001851851851851852d0) &
+                 + in(i-2,j-2) * (-0.013888888888888888d0) &
+                 + in(i+1,j-2) * (-0.004629629629629629d0) &
+                 + in(i+2,j-2) * (-0.004629629629629629d0) &
+                 + in(i+3,j-2) * (-0.004629629629629629d0) &
+                 + in(i+4,j-2) * (-0.004629629629629629d0) &
+                 + in(i+5,j-2) * (-0.004629629629629629d0) &
+                 + in(i+6,j-2) * (-0.004629629629629629d0) &
+                 + in(i+7,j-2) * (-0.004629629629629629d0) &
+                 + in(i+8,j-2) * (-0.004629629629629629d0) &
+                 + in(i+9,j-2) * (-0.004629629629629629d0) &
+                 + in(i-1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+1,j-1) * (-0.027777777777777776d0) &
+                 + in(i+2,j-1) * (-0.027777777777777776d0) &
+                 + in(i+3,j-1) * (-0.027777777777777776d0) &
+                 + in(i+4,j-1) * (-0.027777777777777776d0) &
+                 + in(i+5,j-1) * (-0.027777777777777776d0) &
+                 + in(i+6,j-1) * (-0.027777777777777776d0) &
+                 + in(i+7,j-1) * (-0.027777777777777776d0) &
+                 + in(i+8,j-1) * (-0.027777777777777776d0) &
+                 + in(i+9,j-1) * (-0.027777777777777776d0) &
+                 + in(i-9,j+1) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+1) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+1) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+1) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+1) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+1) * (-0.000992063492063492d0) &
+                 + in(i-3,j+1) * (-0.001851851851851852d0) &
+                 + in(i-2,j+1) * (-0.004629629629629629d0) &
+                 + in(i-1,j+1) * (-0.027777777777777776d0) &
+                 + in(i+1,j+1) * (0.027777777777777776d0) &
+                 + in(i+2,j+1) * (0.004629629629629629d0) &
+                 + in(i+3,j+1) * (0.001851851851851852d0) &
+                 + in(i+4,j+1) * (0.000992063492063492d0) &
+                 + in(i+5,j+1) * (0.0006172839506172839d0) &
+                 + in(i+6,j+1) * (0.00042087542087542086d0) &
+                 + in(i+7,j+1) * (0.00030525030525030525d0) &
+                 + in(i+8,j+1) * (0.0002314814814814815d0) &
+                 + in(i+9,j+1) * (0.00018155410312273057d0) &
+                 + in(i-9,j+2) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+2) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+2) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+2) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+2) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+2) * (-0.000992063492063492d0) &
+                 + in(i-3,j+2) * (-0.001851851851851852d0) &
+                 + in(i-2,j+2) * (-0.004629629629629629d0) &
+                 + in(i-1,j+2) * (-0.027777777777777776d0) &
+                 + in(i+1,j+2) * (0.004629629629629629d0) &
+                 + in(i+2,j+2) * (0.013888888888888888d0) &
+                 + in(i+3,j+2) * (0.001851851851851852d0) &
+                 + in(i+4,j+2) * (0.000992063492063492d0) &
+                 + in(i+5,j+2) * (0.0006172839506172839d0) &
+                 + in(i+6,j+2) * (0.00042087542087542086d0) &
+                 + in(i+7,j+2) * (0.00030525030525030525d0) &
+                 + in(i+8,j+2) * (0.0002314814814814815d0) &
+                 + in(i+9,j+2) * (0.00018155410312273057d0) &
+                 + in(i-9,j+3) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+3) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+3) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+3) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+3) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+3) * (-0.000992063492063492d0) &
+                 + in(i-3,j+3) * (-0.001851851851851852d0) &
+                 + in(i-2,j+3) * (-0.004629629629629629d0) &
+                 + in(i-1,j+3) * (-0.027777777777777776d0) &
+                 + in(i+1,j+3) * (0.001851851851851852d0) &
+                 + in(i+2,j+3) * (0.001851851851851852d0) &
+                 + in(i+3,j+3) * (0.009259259259259259d0) &
+                 + in(i+4,j+3) * (0.000992063492063492d0) &
+                 + in(i+5,j+3) * (0.0006172839506172839d0) &
+                 + in(i+6,j+3) * (0.00042087542087542086d0) &
+                 + in(i+7,j+3) * (0.00030525030525030525d0) &
+                 + in(i+8,j+3) * (0.0002314814814814815d0) &
+                 + in(i+9,j+3) * (0.00018155410312273057d0) &
+                 + in(i-9,j+4) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+4) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+4) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+4) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+4) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+4) * (-0.000992063492063492d0) &
+                 + in(i-3,j+4) * (-0.001851851851851852d0) &
+                 + in(i-2,j+4) * (-0.004629629629629629d0) &
+                 + in(i-1,j+4) * (-0.027777777777777776d0) &
+                 + in(i+1,j+4) * (0.000992063492063492d0) &
+                 + in(i+2,j+4) * (0.000992063492063492d0) &
+                 + in(i+3,j+4) * (0.000992063492063492d0) &
+                 + in(i+4,j+4) * (0.006944444444444444d0) &
+                 + in(i+5,j+4) * (0.0006172839506172839d0) &
+                 + in(i+6,j+4) * (0.00042087542087542086d0) &
+                 + in(i+7,j+4) * (0.00030525030525030525d0) &
+                 + in(i+8,j+4) * (0.0002314814814814815d0) &
+                 + in(i+9,j+4) * (0.00018155410312273057d0) &
+                 + in(i-9,j+5) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+5) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+5) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+5) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+5) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+5) * (-0.000992063492063492d0) &
+                 + in(i-3,j+5) * (-0.001851851851851852d0) &
+                 + in(i-2,j+5) * (-0.004629629629629629d0) &
+                 + in(i-1,j+5) * (-0.027777777777777776d0) &
+                 + in(i+1,j+5) * (0.0006172839506172839d0) &
+                 + in(i+2,j+5) * (0.0006172839506172839d0) &
+                 + in(i+3,j+5) * (0.0006172839506172839d0) &
+                 + in(i+4,j+5) * (0.0006172839506172839d0) &
+                 + in(i+5,j+5) * (0.005555555555555556d0) &
+                 + in(i+6,j+5) * (0.00042087542087542086d0) &
+                 + in(i+7,j+5) * (0.00030525030525030525d0) &
+                 + in(i+8,j+5) * (0.0002314814814814815d0) &
+                 + in(i+9,j+5) * (0.00018155410312273057d0) &
+                 + in(i-9,j+6) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+6) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+6) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+6) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+6) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+6) * (-0.000992063492063492d0) &
+                 + in(i-3,j+6) * (-0.001851851851851852d0) &
+                 + in(i-2,j+6) * (-0.004629629629629629d0) &
+                 + in(i-1,j+6) * (-0.027777777777777776d0) &
+                 + in(i+1,j+6) * (0.00042087542087542086d0) &
+                 + in(i+2,j+6) * (0.00042087542087542086d0) &
+                 + in(i+3,j+6) * (0.00042087542087542086d0) &
+                 + in(i+4,j+6) * (0.00042087542087542086d0) &
+                 + in(i+5,j+6) * (0.00042087542087542086d0) &
+                 + in(i+6,j+6) * (0.004629629629629629d0) &
+                 + in(i+7,j+6) * (0.00030525030525030525d0) &
+                 + in(i+8,j+6) * (0.0002314814814814815d0) &
+                 + in(i+9,j+6) * (0.00018155410312273057d0) &
+                 + in(i-9,j+7) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+7) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+7) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+7) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+7) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+7) * (-0.000992063492063492d0) &
+                 + in(i-3,j+7) * (-0.001851851851851852d0) &
+                 + in(i-2,j+7) * (-0.004629629629629629d0) &
+                 + in(i-1,j+7) * (-0.027777777777777776d0) &
+                 + in(i+1,j+7) * (0.00030525030525030525d0) &
+                 + in(i+2,j+7) * (0.00030525030525030525d0) &
+                 + in(i+3,j+7) * (0.00030525030525030525d0) &
+                 + in(i+4,j+7) * (0.00030525030525030525d0) &
+                 + in(i+5,j+7) * (0.00030525030525030525d0) &
+                 + in(i+6,j+7) * (0.00030525030525030525d0) &
+                 + in(i+7,j+7) * (0.003968253968253968d0) &
+                 + in(i+8,j+7) * (0.0002314814814814815d0) &
+                 + in(i+9,j+7) * (0.00018155410312273057d0) &
+                 + in(i-9,j+8) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+8) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+8) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+8) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+8) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+8) * (-0.000992063492063492d0) &
+                 + in(i-3,j+8) * (-0.001851851851851852d0) &
+                 + in(i-2,j+8) * (-0.004629629629629629d0) &
+                 + in(i-1,j+8) * (-0.027777777777777776d0) &
+                 + in(i+1,j+8) * (0.0002314814814814815d0) &
+                 + in(i+2,j+8) * (0.0002314814814814815d0) &
+                 + in(i+3,j+8) * (0.0002314814814814815d0) &
+                 + in(i+4,j+8) * (0.0002314814814814815d0) &
+                 + in(i+5,j+8) * (0.0002314814814814815d0) &
+                 + in(i+6,j+8) * (0.0002314814814814815d0) &
+                 + in(i+7,j+8) * (0.0002314814814814815d0) &
+                 + in(i+8,j+8) * (0.003472222222222222d0) &
+                 + in(i+9,j+8) * (0.00018155410312273057d0) &
+                 + in(i-9,j+9) * (-0.00018155410312273057d0) &
+                 + in(i-8,j+9) * (-0.0002314814814814815d0) &
+                 + in(i-7,j+9) * (-0.00030525030525030525d0) &
+                 + in(i-6,j+9) * (-0.00042087542087542086d0) &
+                 + in(i-5,j+9) * (-0.0006172839506172839d0) &
+                 + in(i-4,j+9) * (-0.000992063492063492d0) &
+                 + in(i-3,j+9) * (-0.001851851851851852d0) &
+                 + in(i-2,j+9) * (-0.004629629629629629d0) &
+                 + in(i-1,j+9) * (-0.027777777777777776d0) &
+                 + in(i+1,j+9) * (0.00018155410312273057d0) &
+                 + in(i+2,j+9) * (0.00018155410312273057d0) &
+                 + in(i+3,j+9) * (0.00018155410312273057d0) &
+                 + in(i+4,j+9) * (0.00018155410312273057d0) &
+                 + in(i+5,j+9) * (0.00018155410312273057d0) &
+                 + in(i+6,j+9) * (0.00018155410312273057d0) &
+                 + in(i+7,j+9) * (0.00018155410312273057d0) &
+                 + in(i+8,j+9) * (0.00018155410312273057d0) &
+                 + in(i+9,j+9) * (0.0030864197530864196d0) &
 +0.0
       end do
       !$omp end simd

From fcec426bd6eedbd501a059d7de931295ef7f68f1 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Wed, 25 Oct 2017 13:20:18 -0700
Subject: [PATCH 182/245] Fix for mapper interface update.

---
 LEGION/Stencil/stencil.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/LEGION/Stencil/stencil.cc b/LEGION/Stencil/stencil.cc
index bd09ba174..99b2a0666 100644
--- a/LEGION/Stencil/stencil.cc
+++ b/LEGION/Stencil/stencil.cc
@@ -96,7 +96,8 @@ class StencilMapper : public DefaultMapper
                                 const MapMustEpochInput&      input,
                                       MapMustEpochOutput&     output);
     virtual Memory default_policy_select_target_memory(MapperContext ctx,
-                                            Processor target_proc);
+                                            Processor target_proc,
+                                            const RegionRequirement &req);
   private:
     //std::vector<Processor>& procs_list;
     std::vector<Memory>& sysmems_list;
@@ -119,7 +120,8 @@ StencilMapper::StencilMapper(MapperRuntime *rt, Machine machine, Processor local
 }
 
 Memory StencilMapper::default_policy_select_target_memory(MapperContext ctx,
-                                                         Processor target_proc)
+                                                  Processor target_proc,
+                                                  const RegionRequirement &req)
 {
   return proc_sysmems[target_proc];
 }

From 80d3a11c9eeaf2b43e1d17b2e559eb52a4b8501b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 21 Mar 2018 13:07:32 -0700
Subject: [PATCH 183/245] do to SYCL what we have for OpenCL

---
 Cxx11/nstream-sycl.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index bebfb5932..2a8c83548 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -123,6 +123,10 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
     std::cout << e.what() << std::endl;
     return;
   }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
@@ -262,9 +266,15 @@ int main(int argc, char * argv[])
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+    return 1;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
   }
 
   return 0;

From af7f70a937195898dd51661a2e689aeb2b5d7307 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 22 Mar 2018 06:14:10 -0700
Subject: [PATCH 184/245] fix name mangling issue - thanks Rod@CodePlay!

---
 Cxx11/nstream-sycl.cc | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 2a8c83548..8969fdf3a 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -117,6 +117,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+<<<<<<< HEAD
     return;
   }
   catch (std::exception e) {
@@ -125,6 +126,8 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (const char * e) {
     std::cout << e << std::endl;
+=======
+>>>>>>> fix name mangling issue - thanks Rod@CodePlay!
     return;
   }
 
@@ -207,14 +210,22 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////
 
   try {
+<<<<<<< HEAD
 
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
         auto device      = host.get_device();
+=======
+    cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+    if (1) {
+        auto device      = cpu.get_device();
+        auto platform    = device.get_platform();
+>>>>>>> fix name mangling issue - thanks Rod@CodePlay!
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+<<<<<<< HEAD
 #endif
 
         run<float>(host, iterations, length);
@@ -226,9 +237,22 @@ int main(int argc, char * argv[])
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
 #ifndef TRISYCL
         auto device      = cpu.get_device();
+=======
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+
+        run<float>(cpu, iterations, length);
+        run<double>(cpu, iterations, length);
+    }
+
+    cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+    if (1) {
+        auto device      = gpu.get_device();
+        auto platform    = device.get_platform();
+>>>>>>> fix name mangling issue - thanks Rod@CodePlay!
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+<<<<<<< HEAD
         bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
 #else
         bool has_spir = true; // ?
@@ -262,10 +286,17 @@ int main(int argc, char * argv[])
           run<double>(gpu, iterations, length);
 #endif
         }
+=======
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+
+        run<float>(gpu, iterations, length);
+        run<double>(gpu, iterations, length);
+>>>>>>> fix name mangling issue - thanks Rod@CodePlay!
     }
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+<<<<<<< HEAD
     return 1;
   }
   catch (std::exception e) {
@@ -274,6 +305,8 @@ int main(int argc, char * argv[])
   }
   catch (const char * e) {
     std::cout << e << std::endl;
+=======
+>>>>>>> fix name mangling issue - thanks Rod@CodePlay!
     return 1;
   }
 

From 0aa743c4eb560b8bc4f715b1b7b50488654fa658 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 22 Mar 2018 16:32:20 -0700
Subject: [PATCH 185/245] hard-code SYCL to CPU execution only due to GPU
 issues

the bandwidth reported is consistent for elements, not bytes, which
means that something is wrong.  64b data should not lead to BW that is
2x 32b data...
---
 Cxx11/nstream-sycl.cc | 45 +++----------------------------------------
 1 file changed, 3 insertions(+), 42 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 8969fdf3a..0142d7913 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -117,7 +117,6 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
-<<<<<<< HEAD
     return;
   }
   catch (std::exception e) {
@@ -126,8 +125,6 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (const char * e) {
     std::cout << e << std::endl;
-=======
->>>>>>> fix name mangling issue - thanks Rod@CodePlay!
     return;
   }
 
@@ -210,24 +207,15 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////
 
   try {
-<<<<<<< HEAD
 
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
         auto device      = host.get_device();
-=======
-    cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-    if (1) {
-        auto device      = cpu.get_device();
         auto platform    = device.get_platform();
->>>>>>> fix name mangling issue - thanks Rod@CodePlay!
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-<<<<<<< HEAD
 #endif
-
         run<float>(host, iterations, length);
         run<double>(host, iterations, length);
     }
@@ -237,30 +225,12 @@ int main(int argc, char * argv[])
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
 #ifndef TRISYCL
         auto device      = cpu.get_device();
-=======
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
-
-        run<float>(cpu, iterations, length);
-        run<double>(cpu, iterations, length);
-    }
-
-    cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-    if (1) {
-        auto device      = gpu.get_device();
         auto platform    = device.get_platform();
->>>>>>> fix name mangling issue - thanks Rod@CodePlay!
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        auto platform    = device.get_platform();
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-<<<<<<< HEAD
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
 #endif
-        if (has_spir) {
-          run<float>(cpu, iterations, length);
-          run<double>(cpu, iterations, length);
-        }
+        run<float>(cpu, iterations, length);
+        run<double>(cpu, iterations, length);
     }
 
     // NVIDIA GPU requires ptx64 target and does not work very well
@@ -268,8 +238,8 @@ int main(int argc, char * argv[])
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
 #ifndef TRISYCL
         auto device      = gpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
         bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
 #else
@@ -286,17 +256,10 @@ int main(int argc, char * argv[])
           run<double>(gpu, iterations, length);
 #endif
         }
-=======
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
-
-        run<float>(gpu, iterations, length);
-        run<double>(gpu, iterations, length);
->>>>>>> fix name mangling issue - thanks Rod@CodePlay!
     }
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
-<<<<<<< HEAD
     return 1;
   }
   catch (std::exception e) {
@@ -305,8 +268,6 @@ int main(int argc, char * argv[])
   }
   catch (const char * e) {
     std::cout << e << std::endl;
-=======
->>>>>>> fix name mangling issue - thanks Rod@CodePlay!
     return 1;
   }
 

From 29d05b6c8be06751eec6d954b05f0d65d252d38f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 22 May 2018 08:23:07 -0700
Subject: [PATCH 186/245] add host, catch std exception

---
 Cxx11/nstream-sycl.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 0142d7913..d3ddbeab6 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -270,6 +270,9 @@ int main(int argc, char * argv[])
     std::cout << e << std::endl;
     return 1;
   }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
 
   return 0;
 }

From b9534313fa77417139f690a8a456683d581881c6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 31 May 2018 11:20:54 -0700
Subject: [PATCH 187/245] c++1z instead of c++17

---
 travis/build-run-prk.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 962ecc1f4..1f40f02a8 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -668,9 +668,9 @@ case "$PRK_TARGET" in
             SYCLDIR=${TRAVIS_ROOT}/triSYCL
             if [ "${CC}" = "clang" ] ; then
                 # SYCL will compile without OpenMP
-                echo "SYCLCXX=${PRK_CXX} -pthread -std=c++17" >> common/make.defs
+                echo "SYCLCXX=${PRK_CXX} -pthread -std=c++1z" >> common/make.defs
             else
-                echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++17" >> common/make.defs
+                echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++1z" >> common/make.defs
             fi
             echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs
             ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl

From 33d5286576e1a0eff3a3e180f7ecdcf362200f98 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 3 Mar 2019 11:00:04 -0800
Subject: [PATCH 188/245] add list platforms

---
 Cxx11/nstream-opencl.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc
index 40b76d4cc..4ef40bd64 100644
--- a/Cxx11/nstream-opencl.cc
+++ b/Cxx11/nstream-opencl.cc
@@ -190,6 +190,8 @@ int main(int argc, char* argv[])
   /// Setup OpenCL environment
   //////////////////////////////////////////////////////////////////////
 
+  prk::opencl::listPlatforms();
+
   cl_int err = CL_SUCCESS;
 
   cl::Context cpu(CL_DEVICE_TYPE_CPU, NULL, NULL, NULL, &err);

From 678ef8c652072d4aae41c553d0a457872daf9ad0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 3 Mar 2019 11:00:41 -0800
Subject: [PATCH 189/245] add OpenCL info and SYCL exception parsing

---
 Cxx11/nstream-sycl.cc | 55 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index d3ddbeab6..26025943c 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -66,6 +66,11 @@
 
 #include "prk_util.h"
 
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
 // need to declare kernel class as template
 // to prevent name mangling conflict below
 template <typename T> class nstream;
@@ -89,9 +94,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 
   try {
 
-    cl::sycl::buffer<T> d_A { h_A.data(), h_A.size() };
-    cl::sycl::buffer<T> d_B { h_B.data(), h_B.size() };
-    cl::sycl::buffer<T> d_C { h_C.data(), h_C.size() };
+    cl::sycl::buffer<T,1> d_A { h_A.data(), cl::sycl::range<1>(h_A.size()) };
+    cl::sycl::buffer<T,1> d_B { h_B.data(), cl::sycl::range<1>(h_B.size()) };
+    cl::sycl::buffer<T,1> d_C { h_C.data(), cl::sycl::range<1>(h_C.size()) };
 
     for (int iter = 0; iter<=iterations; ++iter) {
 
@@ -117,6 +122,11 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
     return;
   }
   catch (std::exception e) {
@@ -206,8 +216,11 @@ int main(int argc, char * argv[])
   /// Setup SYCL environment
   //////////////////////////////////////////////////////////////////////
 
-  try {
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
 
+  try {
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
@@ -219,7 +232,20 @@ int main(int argc, char * argv[])
         run<float>(host, iterations, length);
         run<double>(host, iterations, length);
     }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
 
+  try {
     // CPU requires spir64 target
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
@@ -232,7 +258,20 @@ int main(int argc, char * argv[])
         run<float>(cpu, iterations, length);
         run<double>(cpu, iterations, length);
     }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
 
+  try {
     // NVIDIA GPU requires ptx64 target and does not work very well
     if (1) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
@@ -260,6 +299,11 @@ int main(int argc, char * argv[])
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
     return 1;
   }
   catch (std::exception e) {
@@ -270,9 +314,6 @@ int main(int argc, char * argv[])
     std::cout << e << std::endl;
     return 1;
   }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-  }
 
   return 0;
 }

From 05f30bdff6bd6f5ddbf787abca6935fb96061f95 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 13:10:29 -0700
Subject: [PATCH 190/245] SYCL with explicit date movement

---
 Cxx11/Makefile                   |   2 +-
 Cxx11/nstream-sycl-explicit.cc   | 337 +++++++++++++++++++++++++++++++
 Cxx11/transpose-sycl-explicit.cc | 299 +++++++++++++++++++++++++++
 3 files changed, 637 insertions(+), 1 deletion(-)
 create mode 100644 Cxx11/nstream-sycl-explicit.cc
 create mode 100644 Cxx11/transpose-sycl-explicit.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index b166d65d4..d1c945ea6 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -105,7 +105,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
-sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl
+sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-sycl-explicit nstream-sycl-explicit
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \
      p2p-hyperplane-vector-tbb p2p-tasks-tbb
diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc
new file mode 100644
index 000000000..2f177db35
--- /dev/null
+++ b/Cxx11/nstream-sycl-explicit.cc
@@ -0,0 +1,337 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "CL/sycl.hpp"
+#include "prk_util.h"
+
+#define PREBUILD_KERNEL 1
+
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
+// need to declare kernel class as template
+// to prevent name mangling conflict below
+template <typename T> class nstream;
+
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t length)
+{
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time(0);
+
+  const T scalar(3);
+
+  std::vector<T> h_A(length,0);
+
+  try {
+
+#if PREBUILD_KERNEL
+    cl::sycl::program kernel(q.get_context());
+    kernel.build_with_kernel_type<nstream<T>>();
+#endif
+
+    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{length} };
+    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{length} };
+    cl::sycl::buffer<T> d_C { cl::sycl::range<1>{length} };
+
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.fill(A,(T)0);
+    });
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.fill(B,(T)2);
+    });
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.fill(C,(T)2);
+    });
+    q.wait();
+
+    for (int iter = 0; iter<=iterations; ++iter) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      q.submit([&](cl::sycl::handler& h) {
+
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+
+        h.parallel_for<class nstream<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<nstream<T>>(),
+#endif
+                cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+            A[i] += B[i] + scalar * C[i];
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    nstream_time = prk::wtime() - nstream_time;
+
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.copy(A,h_A.data());
+    });
+    q.wait();
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+    return;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  T ar(0);
+  T br(2);
+  T cr(2);
+  for (int i=0; i<=iterations; ++i) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; ++i) {
+      asum += std::fabs(h_A[i]);
+  }
+
+  const double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(T);
+      std::cout << 8*sizeof(T) << "B "
+                << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
+  try {
+    if (length<100000) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
+        run<float>(host, iterations, length);
+        run<double>(host, iterations, length);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
+        auto device      = cpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
+#endif
+        if (has_spir) {
+          run<float>(cpu, iterations, length);
+          run<double>(cpu, iterations, length);
+        }
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (1) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
+        auto device      = gpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
+#else
+        bool has_spir = true; // ?
+        bool has_fp64 = true;
+#endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir) {
+          run<float>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
+#endif
+        }
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+    return 1;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-sycl-explicit.cc
new file mode 100644
index 000000000..8b09da622
--- /dev/null
+++ b/Cxx11/transpose-sycl-explicit.cc
@@ -0,0 +1,299 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "CL/sycl.hpp"
+#include "prk_util.h"
+
+#define PREBUILD_KERNEL 1
+
+// need to declare kernel class as template
+// to prevent name mangling conflict below
+template <typename T> class iota;
+template <typename T> class transpose;
+
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t order)
+{
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  double trans_time(0);
+
+  std::vector<T> h_B(order*order,(T)0);
+
+  try {
+
+#if PREBUILD_KERNEL
+    cl::sycl::program kernel(q.get_context());
+    kernel.build_with_kernel_type<transpose<T>>();
+#endif
+
+#if USE_2D_INDEXING
+    cl::sycl::buffer<T,2> d_A( cl::sycl::range<2>{order,order} );
+    cl::sycl::buffer<T,2> d_B( cl::sycl::range<2>{order,order} );
+#else
+    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{order*order}  };
+    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{order*order}  };
+#endif
+
+    q.submit([&](cl::sycl::handler& h) {
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        h.parallel_for<class iota<T>>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) {
+            A[i] = i[0] * order + i[1];
+        });
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        h.parallel_for<class iota<T>>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) {
+            A[i] = i[0];
+        });
+#endif
+    });
+    q.submit([&](cl::sycl::handler& h) {
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+#endif
+        h.fill(B,(T)0);
+    });
+    q.wait();
+
+    for (int iter = 0; iter<=iterations; ++iter) {
+
+      if (iter==1) trans_time = prk::wtime();
+
+      q.submit([&](cl::sycl::handler& h) {
+
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+#endif
+
+        h.parallel_for<class transpose<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<transpose<T>>(),
+#endif
+                cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+#if USE_2D_INDEXING
+          cl::sycl::id<2> ij{it[0],it[1]};
+          cl::sycl::id<2> ji{it[1],it[0]};
+          B[ij] += A[ji];
+          A[ji] += (T)1;
+#else
+          B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
+          A[it[1] * order + it[0]] += (T)1;
+#endif
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    trans_time = prk::wtime() - trans_time;
+
+    q.submit([&](cl::sycl::handler& h) {
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+#endif
+        h.copy(B,h_B.data());
+    });
+    q.wait();
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // TODO: replace with std::generate, std::accumulate, or similar
+  const T addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  for (size_t i=0; i<order; ++i) {
+    for (size_t j=0; j<order; ++j) {
+      size_t const ij = i*order+j;
+      size_t const ji = j*order+i;
+      const T reference = static_cast<T>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(h_B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const double epsilon(1.0e-8);
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    double avgtime = trans_time/iterations;
+    double bytes = (size_t)order * (size_t)order * sizeof(T);
+    std::cout << 8*sizeof(T) << "B "
+              << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+  try {
+
+    if (1) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
+
+        run<float>(host, iterations, order);
+        run<double>(host, iterations, order);
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
+        auto device      = cpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(cpu, iterations, order);
+        run<double>(cpu, iterations, order);
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (0) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
+        auto device      = gpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(gpu, iterations, order);
+        run<double>(gpu, iterations, order);
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+
+  return 0;
+}
+
+

From 11c06ca00e7a7d6af14e7ce04b0543e365f22dbf Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 13:12:13 -0700
Subject: [PATCH 191/245] reconcile structure with explicit copy versions

---
 Cxx11/nstream-sycl.cc   | 72 ++++++++++++++++++++---------------------
 Cxx11/transpose-sycl.cc | 22 +++++++++----
 2 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 26025943c..277f9435e 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -63,9 +63,10 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "CL/sycl.hpp"
-
 #include "prk_util.h"
 
+#define PREBUILD_KERNEL 1
+
 #if 0
 #include "prk_opencl.h"
 #define USE_OPENCL 1
@@ -84,16 +85,19 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 
   double nstream_time(0);
 
+  const T scalar(3);
+
   std::vector<T> h_A(length,0);
   std::vector<T> h_B(length,2);
   std::vector<T> h_C(length,2);
 
-  auto range = prk::range(static_cast<size_t>(0), length);
-
-  const T scalar(3);
-
   try {
 
+#if PREBUILD_KERNEL
+    cl::sycl::program kernel(q.get_context());
+    kernel.build_with_kernel_type<nstream<T>>();
+#endif
+
     cl::sycl::buffer<T,1> d_A { h_A.data(), cl::sycl::range<1>(h_A.size()) };
     cl::sycl::buffer<T,1> d_B { h_B.data(), cl::sycl::range<1>(h_B.size()) };
     cl::sycl::buffer<T,1> d_C { h_C.data(), cl::sycl::range<1>(h_C.size()) };
@@ -108,7 +112,11 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
         auto B = d_B.template get_access<cl::sycl::access::mode::read>(h);
         auto C = d_C.template get_access<cl::sycl::access::mode::read>(h);
 
-        h.parallel_for<class nstream<T>>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+        h.parallel_for<class nstream<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<nstream<T>>(),
+#endif
+                cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
             A[i] += B[i] + scalar * C[i];
         });
       });
@@ -221,7 +229,7 @@ int main(int argc, char * argv[])
 #endif
 
   try {
-    if (1) {
+    if (length<100000) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
         auto device      = host.get_device();
@@ -231,21 +239,10 @@ int main(int argc, char * argv[])
 #endif
         run<float>(host, iterations, length);
         run<double>(host, iterations, length);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
-  }
-  catch (cl::sycl::exception e) {
-    std::cout << e.what() << std::endl;
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-  }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-  }
 
-  try {
     // CPU requires spir64 target
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
@@ -254,24 +251,16 @@ int main(int argc, char * argv[])
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
 #endif
-        run<float>(cpu, iterations, length);
-        run<double>(cpu, iterations, length);
+        if (has_spir) {
+          run<float>(cpu, iterations, length);
+          run<double>(cpu, iterations, length);
+        }
     }
-  }
-  catch (cl::sycl::exception e) {
-    std::cout << e.what() << std::endl;
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-  }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-  }
 
-  try {
     // NVIDIA GPU requires ptx64 target and does not work very well
     if (1) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
@@ -281,18 +270,27 @@ int main(int argc, char * argv[])
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
         bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
 #else
         bool has_spir = true; // ?
+        bool has_fp64 = true;
 #endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
         if (has_spir) {
           run<float>(gpu, iterations, length);
-          run<double>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
         } else {
           std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
 #ifdef __COMPUTECPP__
           std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
           run<float>(gpu, iterations, length);
-          run<double>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
 #endif
         }
     }
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index e7b1d94d2..a0fa97d00 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -50,9 +50,10 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "CL/sycl.hpp"
-
 #include "prk_util.h"
 
+#define PREBUILD_KERNEL 1
+
 // need to declare kernel class as template
 // to prevent name mangling conflict below
 template <typename T> class transpose;
@@ -67,13 +68,18 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
   double trans_time(0);
 
   std::vector<T> h_A(order*order);
-  std::vector<T> h_B(order*order,static_cast<T>(0));
+  std::vector<T> h_B(order*order,(T)0);
 
   // fill A with the sequence 0 to order^2-1 as doubles
   std::iota(h_A.begin(), h_A.end(), static_cast<T>(0));
 
   try {
 
+#if PREBUILD_KERNEL
+    cl::sycl::program kernel(q.get_context());
+    kernel.build_with_kernel_type<transpose<T>>();
+#endif
+
 #if USE_2D_INDEXING
     cl::sycl::buffer<T,2> d_A( h_A.data(), cl::sycl::range<2>{order,order} );
     cl::sycl::buffer<T,2> d_B( h_B.data(), cl::sycl::range<2>{order,order} );
@@ -92,16 +98,19 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
         auto A = d_A.template get_access<cl::sycl::access::mode::read_write>(h);
         auto B = d_B.template get_access<cl::sycl::access::mode::read_write>(h);
 
-        // transpose
-        h.parallel_for<class transpose<T>>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+        h.parallel_for<class transpose<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<transpose<T>>(),
+#endif
+                cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
 #if USE_2D_INDEXING
           cl::sycl::id<2> ij{it[0],it[1]};
           cl::sycl::id<2> ji{it[1],it[0]};
           B[ij] += A[ji];
-          A[ji] += static_cast<T>(1);
+          A[ji] += (T)1;
 #else
           B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
-          A[it[1] * order + it[0]] += static_cast<T>(1);
+          A[it[1] * order + it[0]] += (T)1;
 #endif
         });
       });
@@ -238,7 +247,6 @@ int main(int argc, char * argv[])
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
         //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
 #endif
-
         run<float>(gpu, iterations, order);
         run<double>(gpu, iterations, order);
     }

From 8f431b1e68381b485c63c03a5457962659a227ea Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 13:18:52 -0700
Subject: [PATCH 192/245] fixing things

---
 Cxx11/Makefile                                             | 2 +-
 .../{nstream-sycl-explicit.cc => nstream-explicit-sycl.cc} | 2 --
 Cxx11/nstream-sycl.cc                                      | 2 --
 Cxx11/prk_util.h                                           | 7 +++++++
 ...anspose-sycl-explicit.cc => transpose-explicit-sycl.cc} | 2 --
 Cxx11/transpose-sycl.cc                                    | 2 --
 6 files changed, 8 insertions(+), 9 deletions(-)
 rename Cxx11/{nstream-sycl-explicit.cc => nstream-explicit-sycl.cc} (99%)
 rename Cxx11/{transpose-sycl-explicit.cc => transpose-explicit-sycl.cc} (99%)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index d1c945ea6..f96e63744 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -105,7 +105,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
-sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-sycl-explicit nstream-sycl-explicit
+sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-explicit-sycl nstream-explicit-sycl
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \
      p2p-hyperplane-vector-tbb p2p-tasks-tbb
diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-explicit-sycl.cc
similarity index 99%
rename from Cxx11/nstream-sycl-explicit.cc
rename to Cxx11/nstream-explicit-sycl.cc
index 2f177db35..6367bf660 100644
--- a/Cxx11/nstream-sycl-explicit.cc
+++ b/Cxx11/nstream-explicit-sycl.cc
@@ -65,8 +65,6 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-#define PREBUILD_KERNEL 1
-
 #if 0
 #include "prk_opencl.h"
 #define USE_OPENCL 1
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 277f9435e..b0fd07be1 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -65,8 +65,6 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-#define PREBUILD_KERNEL 1
-
 #if 0
 #include "prk_opencl.h"
 #define USE_OPENCL 1
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index d2caae1b7..c969af7fd 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -82,6 +82,13 @@
 #define PRK_UNUSED
 #endif
 
+// for SYCL
+#ifdef TRISYCL
+#define PREBUILD_KERNEL 0
+#else
+#define PREBUILD_KERNEL 1
+#endif
+
 namespace prk {
 
     int get_alignment(void)
diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-explicit-sycl.cc
similarity index 99%
rename from Cxx11/transpose-sycl-explicit.cc
rename to Cxx11/transpose-explicit-sycl.cc
index 8b09da622..c34497d97 100644
--- a/Cxx11/transpose-sycl-explicit.cc
+++ b/Cxx11/transpose-explicit-sycl.cc
@@ -52,8 +52,6 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-#define PREBUILD_KERNEL 1
-
 // need to declare kernel class as template
 // to prevent name mangling conflict below
 template <typename T> class iota;
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index a0fa97d00..b853ccf7b 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -52,8 +52,6 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-#define PREBUILD_KERNEL 1
-
 // need to declare kernel class as template
 // to prevent name mangling conflict below
 template <typename T> class transpose;

From a0d8b21fbca31c78f1ef2a8f06a8de612e82dd6c Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 13:56:50 -0700
Subject: [PATCH 193/245] enable optimizations

---
 common/make.defs.llvm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 224edb8d9..092da96b8 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -61,7 +61,7 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib
 SYCLDIR=/opt/sycl/latest
 SYCLCXX=${SYCLDIR}/bin/compute++
 SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp
-SYCLFLAG+=-std=c++14
+SYCLFLAG+=-std=c++14 -O3
 # This makes a huge difference in e.g. nstream...
 #SYCLFLAG+=-no-serial-memop
 # CentOS7 and Ubuntu14 built for this

From 4d7092e9008f33488ea7fe9074cb28e4c1a0657e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 9 May 2019 14:01:46 -0700
Subject: [PATCH 194/245] make.defs.gcc update (#397)

---
 common/make.defs.gcc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index 50b7a572a..b0566487a 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -62,6 +62,7 @@ SYCLFLAG=-I$(SYCLDIR)/include
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
 #SYCLCXX=${CXX} ${OPENMPFLAG}
 #SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+METALFLAG=-framework MetalPerformanceShaders
 #
 # OCCA
 #
@@ -103,6 +104,19 @@ SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
 #SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
 SYCLFLAG+=${RANGEFLAG}
 #
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
+SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG}
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+#
 # CBLAS for C++ DGEMM
 #
 BLASFLAG=-DACCELERATE -framework Accelerate

From bc7e338cdf670851ed7a970a2d12d847711f98f4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 9 May 2019 14:09:00 -0700
Subject: [PATCH 195/245] add SYCL nstream and transpose with explicit data
 movement (#395)

* triSYCL needs C++17

* fix Julia syntax issue

"1./" is a syntax error now.  change to "1.0/"

* do to SYCL what we have for OpenCL

* fix name mangling issue - thanks Rod@CodePlay!

* run 32b for all devices unconditionally

* label result with precision

* hard-code SYCL to CPU execution only due to GPU issues

the bandwidth reported is consistent for elements, not bytes, which
means that something is wrong.  64b data should not lead to BW that is
2x 32b data...

* add host, catch std exception

* c++1z instead of c++17

* fix use of ranges in SYCL

* correct sycl ranges fix

* better example flags

* do to SYCL what we have for OpenCL

* fix name mangling issue - thanks Rod@CodePlay!

* run 32b for all devices unconditionally

* label result with precision

* hard-code SYCL to CPU execution only due to GPU issues

the bandwidth reported is consistent for elements, not bytes, which
means that something is wrong.  64b data should not lead to BW that is
2x 32b data...

* add host, catch std exception

* c++1z instead of c++17

* fix use of ranges in SYCL

* correct sycl ranges fix

* better example flags

* do not incorrectly declare non-read-only buffers as read-only

* Sycl multi device and exceptions (#347)

* triSYCL needs C++17
* fix Julia syntax issue "1./" is a syntax error now.  change to "1.0/"
* do to SYCL what we have for OpenCL
* fix name mangling issue - thanks Rod@CodePlay!
* run 32b for all devices unconditionally
* label result with precision
* hard-code SYCL to CPU execution only due to GPU issues
the bandwidth reported is consistent for elements, not bytes, which
means that something is wrong.  64b data should not lead to BW that is
2x 32b data...
* add host, catch std exception
* c++1z instead of c++17
* fix use of ranges in SYCL
* correct sycl ranges fix
* better example flags

* fix nstream correctness by initializing host vectors

* make transpose-sycl multi-device etc

* templatize stencil sycl kernel over type

* SYCL stencil now templated

* try to detect working configs better

* forward-declare kernel names in SYCL stencil

* fix float template for 2D case

* declare kernel name templates closer to usage

* OpenCL: add No Device errors (#373)

* add No Device errors
* errno needs to be included unconditionally

* remove Rust from parent makefile to unbreak case when cargo missing

* avoid overflow

* Cxx11 nstream-kokkos: add missing fences

There are fences missing hence you wont' measure what you think on
asynchronous backends such as CUDA or HPX.  This also fixes using the actual name of the exec space instead of typeid.
Example for CUDA on V100:
Original:
Parallel Research Kernels version 2.16
C++11/Kokkos STREAM triad: A = B + scalar * C
Number of iterations = 1
Vector length        = 100000000
Offset               = 0
Kokkos execution space: N6Kokkos4CudaE
Solution validates
Rate (MB/s): 422188 Avg time (s): 0.00757957

With fences (and name fix):
Parallel Research Kernels version 2.16
C++11/Kokkos STREAM triad: A = B + scalar * C
Number of iterations = 1
Vector length        = 100000000
Offset               = 0
Kokkos execution space: Cuda
Solution validates
Rate (MB/s): 842600 Avg time (s): 0.00379777

* fix how BLAS linked in Fortran

* range-based TBB parallel_for

* show but do not enable non-range-based for in RAJA

* not yet working prk::vector

* fix prk::vector

* eliminate rule conflict

* switch from std::vector to prk::vector

* use prk::vector instead of std::vector

* use prk::vector instead of std::vector

* use prk::vector instead of std::vector

* use prk::vector instead of std::vector

* use prk::vector instead of std::vector

* better=simpler use of STL

* add variant for prk::vector

* try to implement prk::vector - works for some impls

* add versions that use prk::vector rather than STL

* add versions that use prk::vector rather than STL

* ignore more stuff

* cleanup stencil codegen for vector classes

* clean new targets (prk::vector sequential)

* silence GCC warning

* silence GCC warning

* add new impls

* reorder loops

* fix issues with Thrust when not using NVCC

* update examples for Thrust changes

* switch Thrust to use PRK range wrapper

* work around Clang FE issue

* add hyperplane OpenMP to C1z

* silent compiler warning

* prk::vector impl seems to be working

* silence compiler warning

* clean example for Intel toolchain

* use .data() instead of &([0]) and dynamic schedule loop in DGEMM CBLAS

* Flang is mostly Fortran 2008 complete now

* add kokkos::fence where appropriate

* Update make.defs.llvm

default to CodePlay
disable OCCA

* pointless reordering of string

* add PGI support for IVDEP

* return value qualified is ignored

* TBB does not support PGI

* update PGI example flags

* fix errors

* add hyperplane to make and travis

* try to use explicit data movement

* fix nstream-sycl but performance still terrible

* add optimization flag to example make.defs

* add kernel prebuild option and check for fp64 support

* improve SYCL transpose

1D and 2D both wrong for order>1295

* partial merge from master

* reconcile nstream sycl

* merging

* reconcile transpose sycl
---
 Cxx11/nstream-sycl-explicit.cc   | 337 +++++++++++++++++++++++++++++++
 Cxx11/nstream-sycl.cc            |   2 +
 Cxx11/transpose-sycl-explicit.cc | 299 +++++++++++++++++++++++++++
 Cxx11/transpose-sycl.cc          |   2 +
 4 files changed, 640 insertions(+)
 create mode 100644 Cxx11/nstream-sycl-explicit.cc
 create mode 100644 Cxx11/transpose-sycl-explicit.cc

diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc
new file mode 100644
index 000000000..2f177db35
--- /dev/null
+++ b/Cxx11/nstream-sycl-explicit.cc
@@ -0,0 +1,337 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "CL/sycl.hpp"
+#include "prk_util.h"
+
+#define PREBUILD_KERNEL 1
+
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
+// need to declare kernel class as template
+// to prevent name mangling conflict below
+template <typename T> class nstream;
+
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t length)
+{
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time(0);
+
+  const T scalar(3);
+
+  std::vector<T> h_A(length,0);
+
+  try {
+
+#if PREBUILD_KERNEL
+    cl::sycl::program kernel(q.get_context());
+    kernel.build_with_kernel_type<nstream<T>>();
+#endif
+
+    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{length} };
+    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{length} };
+    cl::sycl::buffer<T> d_C { cl::sycl::range<1>{length} };
+
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.fill(A,(T)0);
+    });
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.fill(B,(T)2);
+    });
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.fill(C,(T)2);
+    });
+    q.wait();
+
+    for (int iter = 0; iter<=iterations; ++iter) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      q.submit([&](cl::sycl::handler& h) {
+
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+
+        h.parallel_for<class nstream<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<nstream<T>>(),
+#endif
+                cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+            A[i] += B[i] + scalar * C[i];
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    nstream_time = prk::wtime() - nstream_time;
+
+    q.submit([&](cl::sycl::handler& h) {
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        h.copy(A,h_A.data());
+    });
+    q.wait();
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+    return;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  T ar(0);
+  T br(2);
+  T cr(2);
+  for (int i=0; i<=iterations; ++i) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; ++i) {
+      asum += std::fabs(h_A[i]);
+  }
+
+  const double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(T);
+      std::cout << 8*sizeof(T) << "B "
+                << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
+  try {
+    if (length<100000) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
+        run<float>(host, iterations, length);
+        run<double>(host, iterations, length);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
+        auto device      = cpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
+#endif
+        if (has_spir) {
+          run<float>(cpu, iterations, length);
+          run<double>(cpu, iterations, length);
+        }
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (1) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
+        auto device      = gpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
+#else
+        bool has_spir = true; // ?
+        bool has_fp64 = true;
+#endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir) {
+          run<float>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
+#endif
+        }
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+    return 1;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index b0fd07be1..277f9435e 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -65,6 +65,8 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
+#define PREBUILD_KERNEL 1
+
 #if 0
 #include "prk_opencl.h"
 #define USE_OPENCL 1
diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-sycl-explicit.cc
new file mode 100644
index 000000000..8b09da622
--- /dev/null
+++ b/Cxx11/transpose-sycl-explicit.cc
@@ -0,0 +1,299 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "CL/sycl.hpp"
+#include "prk_util.h"
+
+#define PREBUILD_KERNEL 1
+
+// need to declare kernel class as template
+// to prevent name mangling conflict below
+template <typename T> class iota;
+template <typename T> class transpose;
+
+template <typename T>
+void run(cl::sycl::queue & q, int iterations, size_t order)
+{
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  double trans_time(0);
+
+  std::vector<T> h_B(order*order,(T)0);
+
+  try {
+
+#if PREBUILD_KERNEL
+    cl::sycl::program kernel(q.get_context());
+    kernel.build_with_kernel_type<transpose<T>>();
+#endif
+
+#if USE_2D_INDEXING
+    cl::sycl::buffer<T,2> d_A( cl::sycl::range<2>{order,order} );
+    cl::sycl::buffer<T,2> d_B( cl::sycl::range<2>{order,order} );
+#else
+    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{order*order}  };
+    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{order*order}  };
+#endif
+
+    q.submit([&](cl::sycl::handler& h) {
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        h.parallel_for<class iota<T>>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) {
+            A[i] = i[0] * order + i[1];
+        });
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        h.parallel_for<class iota<T>>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) {
+            A[i] = i[0];
+        });
+#endif
+    });
+    q.submit([&](cl::sycl::handler& h) {
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+#endif
+        h.fill(B,(T)0);
+    });
+    q.wait();
+
+    for (int iter = 0; iter<=iterations; ++iter) {
+
+      if (iter==1) trans_time = prk::wtime();
+
+      q.submit([&](cl::sycl::handler& h) {
+
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+#endif
+
+        h.parallel_for<class transpose<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<transpose<T>>(),
+#endif
+                cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+#if USE_2D_INDEXING
+          cl::sycl::id<2> ij{it[0],it[1]};
+          cl::sycl::id<2> ji{it[1],it[0]};
+          B[ij] += A[ji];
+          A[ji] += (T)1;
+#else
+          B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
+          A[it[1] * order + it[0]] += (T)1;
+#endif
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    trans_time = prk::wtime() - trans_time;
+
+    q.submit([&](cl::sycl::handler& h) {
+#if USE_2D_INDEXING
+        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+#else
+        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+#endif
+        h.copy(B,h_B.data());
+    });
+    q.wait();
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // TODO: replace with std::generate, std::accumulate, or similar
+  const T addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  for (size_t i=0; i<order; ++i) {
+    for (size_t j=0; j<order; ++j) {
+      size_t const ij = i*order+j;
+      size_t const ji = j*order+i;
+      const T reference = static_cast<T>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(h_B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const double epsilon(1.0e-8);
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    double avgtime = trans_time/iterations;
+    double bytes = (size_t)order * (size_t)order * sizeof(T);
+    std::cout << 8*sizeof(T) << "B "
+              << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+  try {
+
+    if (1) {
+        cl::sycl::queue host(cl::sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+#endif
+
+        run<float>(host, iterations, order);
+        run<double>(host, iterations, order);
+    }
+
+    // CPU requires spir64 target
+    if (1) {
+        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
+#ifndef TRISYCL
+        auto device      = cpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(cpu, iterations, order);
+        run<double>(cpu, iterations, order);
+    }
+
+    // NVIDIA GPU requires ptx64 target and does not work very well
+    if (0) {
+        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
+#ifndef TRISYCL
+        auto device      = gpu.get_device();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
+        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+#endif
+
+        run<float>(gpu, iterations, order);
+        run<double>(gpu, iterations, order);
+    }
+  }
+  catch (cl::sycl::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (std::exception e) {
+    std::cout << e.what() << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index b853ccf7b..a0fa97d00 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -52,6 +52,8 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
+#define PREBUILD_KERNEL 1
+
 // need to declare kernel class as template
 // to prevent name mangling conflict below
 template <typename T> class transpose;

From 45b005ffbf7fffd76493301de8bec3197486ec68 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 14:55:19 -0700
Subject: [PATCH 196/245] cleanup

---
 Cxx11/nstream-sycl.cc   | 2 --
 Cxx11/transpose-sycl.cc | 2 --
 2 files changed, 4 deletions(-)

diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 277f9435e..b0fd07be1 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -65,8 +65,6 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-#define PREBUILD_KERNEL 1
-
 #if 0
 #include "prk_opencl.h"
 #define USE_OPENCL 1
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index a0fa97d00..b853ccf7b 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -52,8 +52,6 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-#define PREBUILD_KERNEL 1
-
 // need to declare kernel class as template
 // to prevent name mangling conflict below
 template <typename T> class transpose;

From ddfdc9c94f15e1111222f073d7dda667cf4bbe1f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 14:56:07 -0700
Subject: [PATCH 197/245] remove renamed code

---
 Cxx11/nstream-sycl-explicit.cc   | 337 -------------------------------
 Cxx11/transpose-sycl-explicit.cc | 299 ---------------------------
 2 files changed, 636 deletions(-)
 delete mode 100644 Cxx11/nstream-sycl-explicit.cc
 delete mode 100644 Cxx11/transpose-sycl-explicit.cc

diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc
deleted file mode 100644
index 2f177db35..000000000
--- a/Cxx11/nstream-sycl-explicit.cc
+++ /dev/null
@@ -1,337 +0,0 @@
-///
-/// Copyright (c) 2017, Intel Corporation
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions
-/// are met:
-///
-/// * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-/// * Redistributions in binary form must reproduce the above
-///       copyright notice, this list of conditions and the following
-///       disclaimer in the documentation and/or other materials provided
-///       with the distribution.
-/// * Neither the name of Intel Corporation nor the names of its
-///       contributors may be used to endorse or promote products
-///       derived from this software without specific prior written
-///       permission.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-
-//////////////////////////////////////////////////////////////////////
-///
-/// NAME:    nstream
-///
-/// PURPOSE: To compute memory bandwidth when adding a vector of a given
-///          number of double precision values to the scalar multiple of
-///          another vector of the same length, and storing the result in
-///          a third vector.
-///
-/// USAGE:   The program takes as input the number
-///          of iterations to loop over the triad vectors, the length of the
-///          vectors, and the offset between vectors
-///
-///          <progname> <# iterations> <vector length> <offset>
-///
-///          The output consists of diagnostics to make sure the
-///          algorithm worked, and of timing statistics.
-///
-/// NOTES:   Bandwidth is determined as the number of words read, plus the
-///          number of words written, times the size of the words, divided
-///          by the execution time. For a vector length of N, the total
-///          number of words read and written is 4*N*sizeof(double).
-///
-///
-/// HISTORY: This code is loosely based on the Stream benchmark by John
-///          McCalpin, but does not follow all the Stream rules. Hence,
-///          reported results should not be associated with Stream in
-///          external publications
-///
-///          Converted to C++11 by Jeff Hammond, November 2017.
-///
-//////////////////////////////////////////////////////////////////////
-
-#include "CL/sycl.hpp"
-#include "prk_util.h"
-
-#define PREBUILD_KERNEL 1
-
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
-// need to declare kernel class as template
-// to prevent name mangling conflict below
-template <typename T> class nstream;
-
-template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t length)
-{
-  //////////////////////////////////////////////////////////////////////
-  // Allocate space and perform the computation
-  //////////////////////////////////////////////////////////////////////
-
-  double nstream_time(0);
-
-  const T scalar(3);
-
-  std::vector<T> h_A(length,0);
-
-  try {
-
-#if PREBUILD_KERNEL
-    cl::sycl::program kernel(q.get_context());
-    kernel.build_with_kernel_type<nstream<T>>();
-#endif
-
-    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{length} };
-    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{length} };
-    cl::sycl::buffer<T> d_C { cl::sycl::range<1>{length} };
-
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        h.fill(A,(T)0);
-    });
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        h.fill(B,(T)2);
-    });
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        h.fill(C,(T)2);
-    });
-    q.wait();
-
-    for (int iter = 0; iter<=iterations; ++iter) {
-
-      if (iter==1) nstream_time = prk::wtime();
-
-      q.submit([&](cl::sycl::handler& h) {
-
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-
-        h.parallel_for<class nstream<T>>(
-#if PREBUILD_KERNEL
-                kernel.get_kernel<nstream<T>>(),
-#endif
-                cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
-            A[i] += B[i] + scalar * C[i];
-        });
-      });
-      q.wait();
-    }
-
-    // Stop timer before buffer+accessor destructors fire,
-    // since that will move data, and we do not time that
-    // for other device-oriented programming models.
-    nstream_time = prk::wtime() - nstream_time;
-
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        h.copy(A,h_A.data());
-    });
-    q.wait();
-  }
-  catch (cl::sycl::exception e) {
-    std::cout << e.what() << std::endl;
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-    return;
-  }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-    return;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return;
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  /// Analyze and output results
-  //////////////////////////////////////////////////////////////////////
-
-  T ar(0);
-  T br(2);
-  T cr(2);
-  for (int i=0; i<=iterations; ++i) {
-      ar += br + scalar * cr;
-  }
-
-  ar *= length;
-
-  double asum(0);
-  for (size_t i=0; i<length; ++i) {
-      asum += std::fabs(h_A[i]);
-  }
-
-  const double epsilon(1.e-8);
-  if (std::fabs(ar-asum)/asum > epsilon) {
-      std::cout << "Failed Validation on output array\n"
-                << "       Expected checksum: " << ar << "\n"
-                << "       Observed checksum: " << asum << std::endl;
-      std::cout << "ERROR: solution did not validate" << std::endl;
-  } else {
-      std::cout << "Solution validates" << std::endl;
-      double avgtime = nstream_time/iterations;
-      double nbytes = 4.0 * length * sizeof(T);
-      std::cout << 8*sizeof(T) << "B "
-                << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
-                << " Avg time (s): " << avgtime << std::endl;
-  }
-}
-
-int main(int argc, char * argv[])
-{
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations, offset;
-  size_t length;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <vector length>";
-      }
-
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      length = std::atol(argv[2]);
-      if (length <= 0) {
-        throw "ERROR: vector length must be positive";
-      }
-
-      offset = (argc>3) ? std::atoi(argv[3]) : 0;
-      if (length <= 0) {
-        throw "ERROR: offset must be nonnegative";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  std::cout << "Number of iterations = " << iterations << std::endl;
-  std::cout << "Vector length        = " << length << std::endl;
-  std::cout << "Offset               = " << offset << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Setup SYCL environment
-  //////////////////////////////////////////////////////////////////////
-
-#ifdef USE_OPENCL
-  prk::opencl::listPlatforms();
-#endif
-
-  try {
-    if (length<100000) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-        run<float>(host, iterations, length);
-        run<double>(host, iterations, length);
-    } else {
-        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
-    }
-
-    // CPU requires spir64 target
-    if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
-        if (has_spir) {
-          run<float>(cpu, iterations, length);
-          run<double>(cpu, iterations, length);
-        }
-    }
-
-    // NVIDIA GPU requires ptx64 target and does not work very well
-    if (1) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
-        if (!has_fp64) {
-          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
-        }
-        if (has_spir) {
-          run<float>(gpu, iterations, length);
-          if (has_fp64) {
-            run<double>(gpu, iterations, length);
-          }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, length);
-          if (has_fp64) {
-            run<double>(gpu, iterations, length);
-          }
-#endif
-        }
-    }
-  }
-  catch (cl::sycl::exception e) {
-    std::cout << e.what() << std::endl;
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-    return 1;
-  }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-    return 1;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  return 0;
-}
-
-
diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-sycl-explicit.cc
deleted file mode 100644
index 8b09da622..000000000
--- a/Cxx11/transpose-sycl-explicit.cc
+++ /dev/null
@@ -1,299 +0,0 @@
-///
-/// Copyright (c) 2013, Intel Corporation
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions
-/// are met:
-///
-/// * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-/// * Redistributions in binary form must reproduce the above
-///       copyright notice, this list of conditions and the following
-///       disclaimer in the documentation and/or other materials provided
-///       with the distribution.
-/// * Neither the name of Intel Corporation nor the names of its
-///       contributors may be used to endorse or promote products
-///       derived from this software without specific prior written
-///       permission.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-
-//////////////////////////////////////////////////////////////////////
-///
-/// NAME:    transpose
-///
-/// PURPOSE: This program measures the time for the transpose of a
-///          column-major stored matrix into a row-major stored matrix.
-///
-/// USAGE:   Program input is the matrix order and the number of times to
-///          repeat the operation:
-///
-///          transpose <matrix_size> <# iterations>
-///
-///          The output consists of diagnostics to make sure the
-///          transpose worked and timing statistics.
-///
-/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
-///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
-///
-//////////////////////////////////////////////////////////////////////
-
-#include "CL/sycl.hpp"
-#include "prk_util.h"
-
-#define PREBUILD_KERNEL 1
-
-// need to declare kernel class as template
-// to prevent name mangling conflict below
-template <typename T> class iota;
-template <typename T> class transpose;
-
-template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t order)
-{
-  //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
-  //////////////////////////////////////////////////////////////////////
-
-  double trans_time(0);
-
-  std::vector<T> h_B(order*order,(T)0);
-
-  try {
-
-#if PREBUILD_KERNEL
-    cl::sycl::program kernel(q.get_context());
-    kernel.build_with_kernel_type<transpose<T>>();
-#endif
-
-#if USE_2D_INDEXING
-    cl::sycl::buffer<T,2> d_A( cl::sycl::range<2>{order,order} );
-    cl::sycl::buffer<T,2> d_B( cl::sycl::range<2>{order,order} );
-#else
-    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{order*order}  };
-    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{order*order}  };
-#endif
-
-    q.submit([&](cl::sycl::handler& h) {
-#if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-        h.parallel_for<class iota<T>>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) {
-            A[i] = i[0] * order + i[1];
-        });
-#else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-        h.parallel_for<class iota<T>>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) {
-            A[i] = i[0];
-        });
-#endif
-    });
-    q.submit([&](cl::sycl::handler& h) {
-#if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-#else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-#endif
-        h.fill(B,(T)0);
-    });
-    q.wait();
-
-    for (int iter = 0; iter<=iterations; ++iter) {
-
-      if (iter==1) trans_time = prk::wtime();
-
-      q.submit([&](cl::sycl::handler& h) {
-
-#if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-#else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-#endif
-
-        h.parallel_for<class transpose<T>>(
-#if PREBUILD_KERNEL
-                kernel.get_kernel<transpose<T>>(),
-#endif
-                cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
-#if USE_2D_INDEXING
-          cl::sycl::id<2> ij{it[0],it[1]};
-          cl::sycl::id<2> ji{it[1],it[0]};
-          B[ij] += A[ji];
-          A[ji] += (T)1;
-#else
-          B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
-          A[it[1] * order + it[0]] += (T)1;
-#endif
-        });
-      });
-      q.wait();
-    }
-
-    // Stop timer before buffer+accessor destructors fire,
-    // since that will move data, and we do not time that
-    // for other device-oriented programming models.
-    trans_time = prk::wtime() - trans_time;
-
-    q.submit([&](cl::sycl::handler& h) {
-#if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-#else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-#endif
-        h.copy(B,h_B.data());
-    });
-    q.wait();
-  }
-  catch (cl::sycl::exception e) {
-    std::cout << e.what() << std::endl;
-    return;
-  }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-    return;
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  /// Analyze and output results
-  //////////////////////////////////////////////////////////////////////
-
-  // TODO: replace with std::generate, std::accumulate, or similar
-  const T addit = (iterations+1.) * (iterations/2.);
-  double abserr(0);
-  for (size_t i=0; i<order; ++i) {
-    for (size_t j=0; j<order; ++j) {
-      size_t const ij = i*order+j;
-      size_t const ji = j*order+i;
-      const T reference = static_cast<T>(ij)*(1.+iterations)+addit;
-      abserr += std::fabs(h_B[ji] - reference);
-    }
-  }
-
-#ifdef VERBOSE
-  std::cout << "Sum of absolute differences: " << abserr << std::endl;
-#endif
-
-  const double epsilon(1.0e-8);
-  if (abserr < epsilon) {
-    std::cout << "Solution validates" << std::endl;
-    double avgtime = trans_time/iterations;
-    double bytes = (size_t)order * (size_t)order * sizeof(T);
-    std::cout << 8*sizeof(T) << "B "
-              << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
-              << " Avg time (s): " << avgtime << std::endl;
-  } else {
-    std::cout << "ERROR: Aggregate squared error " << abserr
-              << " exceeds threshold " << epsilon << std::endl;
-  }
-}
-
-int main(int argc, char * argv[])
-{
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations;
-  size_t order;
-  try {
-      if (argc < 3) {
-        throw "Usage: <# iterations> <matrix order>";
-      }
-
-      // number of times to do the transpose
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      // order of a the matrix
-      order = std::atoi(argv[2]);
-      if (order <= 0) {
-        throw "ERROR: Matrix Order must be greater than 0";
-      } else if (order > std::floor(std::sqrt(INT_MAX))) {
-        throw "ERROR: matrix dimension too large - overflow risk";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  std::cout << "Number of iterations  = " << iterations << std::endl;
-  std::cout << "Matrix order          = " << order << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  /// Setup SYCL environment
-  //////////////////////////////////////////////////////////////////////
-
-  try {
-
-    if (1) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
-        auto device      = host.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-
-        run<float>(host, iterations, order);
-        run<double>(host, iterations, order);
-    }
-
-    // CPU requires spir64 target
-    if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
-        auto device      = cpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
-#endif
-
-        run<float>(cpu, iterations, order);
-        run<double>(cpu, iterations, order);
-    }
-
-    // NVIDIA GPU requires ptx64 target and does not work very well
-    if (0) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
-        auto device      = gpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
-#endif
-
-        run<float>(gpu, iterations, order);
-        run<double>(gpu, iterations, order);
-    }
-  }
-  catch (cl::sycl::exception e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (std::exception e) {
-    std::cout << e.what() << std::endl;
-  }
-
-  return 0;
-}
-
-

From 781e1e55165f9dacdae54e8963bc65e7538708e6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 15:08:12 -0700
Subject: [PATCH 198/245] homogenize exceptions and such

---
 Cxx11/nstream-explicit-sycl.cc   |  6 ++-
 Cxx11/nstream-sycl.cc            |  6 ++-
 Cxx11/stencil-sycl.cc            | 85 ++++++++++++++++++++++++--------
 Cxx11/transpose-explicit-sycl.cc | 80 ++++++++++++++++++++++++------
 Cxx11/transpose-sycl.cc          | 79 ++++++++++++++++++++++++-----
 5 files changed, 205 insertions(+), 51 deletions(-)

diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-explicit-sycl.cc
index 6367bf660..e51b78a28 100644
--- a/Cxx11/nstream-explicit-sycl.cc
+++ b/Cxx11/nstream-explicit-sycl.cc
@@ -70,8 +70,6 @@
 #define USE_OPENCL 1
 #endif
 
-// need to declare kernel class as template
-// to prevent name mangling conflict below
 template <typename T> class nstream;
 
 template <typename T>
@@ -146,11 +144,13 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
     std::cout << e.get_line_number() << std::endl;
     std::cout << e.get_description() << std::endl;
     std::cout << e.get_cl_error_message() << std::endl;
     std::cout << e.get_cl_code() << std::endl;
+#endif
     return;
   }
   catch (std::exception e) {
@@ -313,11 +313,13 @@ int main(int argc, char * argv[])
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
     std::cout << e.get_line_number() << std::endl;
     std::cout << e.get_description() << std::endl;
     std::cout << e.get_cl_error_message() << std::endl;
     std::cout << e.get_cl_code() << std::endl;
+#endif
     return 1;
   }
   catch (std::exception e) {
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index b0fd07be1..f7d42d732 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -70,8 +70,6 @@
 #define USE_OPENCL 1
 #endif
 
-// need to declare kernel class as template
-// to prevent name mangling conflict below
 template <typename T> class nstream;
 
 template <typename T>
@@ -128,11 +126,13 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
     std::cout << e.get_line_number() << std::endl;
     std::cout << e.get_description() << std::endl;
     std::cout << e.get_cl_error_message() << std::endl;
     std::cout << e.get_cl_code() << std::endl;
+#endif
     return;
   }
   catch (std::exception e) {
@@ -295,11 +295,13 @@ int main(int argc, char * argv[])
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
     std::cout << e.get_line_number() << std::endl;
     std::cout << e.get_description() << std::endl;
     std::cout << e.get_cl_error_message() << std::endl;
     std::cout << e.get_cl_code() << std::endl;
+#endif
     return 1;
   }
   catch (std::exception e) {
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 585fe62e9..1d9e34134 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -61,10 +61,14 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "CL/sycl.hpp"
-
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
 template <typename T> class init;
 template <typename T> class add;
 
@@ -186,23 +190,26 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
     return;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
     return;
   }
-
-#if 0
-  for (auto i=0; i<n; i++) {
-    for (auto j=0; j<n; j++) {
-        std::cerr << i << "," << j << "," << h_out[i*n+j] << "\n";
-    }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
   }
-#endif
 
   //////////////////////////////////////////////////////////////////////
-  // Analyze and output results.
+  /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
   // interior of grid with respect to stencil
@@ -309,14 +316,18 @@ int main(int argc, char * argv[])
   /// Setup SYCL environment
   //////////////////////////////////////////////////////////////////////
 
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
   try {
 
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
         auto device      = host.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
 #endif
 
@@ -329,14 +340,17 @@ int main(int argc, char * argv[])
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
 #ifndef TRISYCL
         auto device      = cpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
 #endif
-
-        run<float>(cpu, iterations, n, tile_size, star, radius);
-        run<double>(cpu, iterations, n, tile_size, star, radius);
+        if (has_spir) {
+          run<float>(cpu, iterations, n, tile_size, star, radius);
+          run<double>(cpu, iterations, n, tile_size, star, radius);
+        }
     }
 
     // NVIDIA GPU requires ptx64 target and does not work very well
@@ -344,21 +358,52 @@ int main(int argc, char * argv[])
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
 #ifndef TRISYCL
         auto device      = gpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
+#else
+        bool has_spir = true; // ?
+        bool has_fp64 = true;
+#endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir) {
+          run<float>(gpu, iterations, n, tile_size, star, radius);
+          if (has_fp64) {
+            run<double>(gpu, iterations, n, tile_size, star, radius);
+          }
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, n, tile_size, star, radius);
+          if (has_fp64) {
+            run<double>(gpu, iterations, n, tile_size, star, radius);
+          }
 #endif
-
-        run<float>(gpu, iterations, n, tile_size, star, radius);
-        run<double>(gpu, iterations, n, tile_size, star, radius);
     }
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
+    return 1;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
   }
 
   return 0;
diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc
index c34497d97..cb4e31c05 100644
--- a/Cxx11/transpose-explicit-sycl.cc
+++ b/Cxx11/transpose-explicit-sycl.cc
@@ -52,8 +52,11 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-// need to declare kernel class as template
-// to prevent name mangling conflict below
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
 template <typename T> class iota;
 template <typename T> class transpose;
 
@@ -61,7 +64,7 @@ template <typename T>
 void run(cl::sycl::queue & q, int iterations, size_t order)
 {
   //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
+  // Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
   double trans_time(0);
@@ -156,12 +159,23 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
     return;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
     return;
   }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
@@ -239,14 +253,18 @@ int main(int argc, char * argv[])
   /// Setup SYCL environment
   //////////////////////////////////////////////////////////////////////
 
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
   try {
 
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
         auto device      = host.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
 #endif
 
@@ -259,14 +277,17 @@ int main(int argc, char * argv[])
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
 #ifndef TRISYCL
         auto device      = cpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
 #endif
-
-        run<float>(cpu, iterations, order);
-        run<double>(cpu, iterations, order);
+        if (has_spir) {
+          run<float>(cpu, iterations, order);
+          run<double>(cpu, iterations, order);
+        }
     }
 
     // NVIDIA GPU requires ptx64 target and does not work very well
@@ -274,21 +295,52 @@ int main(int argc, char * argv[])
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
 #ifndef TRISYCL
         auto device      = gpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
+#else
+        bool has_spir = true; // ?
+        bool has_fp64 = true;
+#endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir) {
+          run<float>(gpu, iterations, order);
+          if (has_fp64) {
+            run<double>(gpu, iterations, order);
+          }
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, order);
+          if (has_fp64) {
+            run<double>(gpu, iterations, order);
+          }
 #endif
-
-        run<float>(gpu, iterations, order);
-        run<double>(gpu, iterations, order);
     }
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
+    return 1;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
   }
 
   return 0;
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index b853ccf7b..5ed7b9805 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -52,15 +52,18 @@
 #include "CL/sycl.hpp"
 #include "prk_util.h"
 
-// need to declare kernel class as template
-// to prevent name mangling conflict below
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
 template <typename T> class transpose;
 
 template <typename T>
 void run(cl::sycl::queue & q, int iterations, size_t order)
 {
   //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
+  // Allocate space for the input and transpose matrix
   //////////////////////////////////////////////////////////////////////
 
   double trans_time(0);
@@ -122,12 +125,23 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
     return;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
     return;
   }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
 
   //////////////////////////////////////////////////////////////////////
   /// Analyze and output results
@@ -205,14 +219,18 @@ int main(int argc, char * argv[])
   /// Setup SYCL environment
   //////////////////////////////////////////////////////////////////////
 
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
   try {
 
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
         auto device      = host.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
 #endif
 
@@ -225,14 +243,17 @@ int main(int argc, char * argv[])
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
 #ifndef TRISYCL
         auto device      = cpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
 #endif
-
-        run<float>(cpu, iterations, order);
-        run<double>(cpu, iterations, order);
+        if (has_spir) {
+          run<float>(cpu, iterations, order);
+          run<double>(cpu, iterations, order);
+        }
     }
 
     // NVIDIA GPU requires ptx64 target and does not work very well
@@ -240,20 +261,52 @@ int main(int argc, char * argv[])
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
 #ifndef TRISYCL
         auto device      = gpu.get_device();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        //std::cout << "cl_khr_spir:   " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl;
+        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
+#else
+        bool has_spir = true; // ?
+        bool has_fp64 = true;
+#endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir) {
+          run<float>(gpu, iterations, order);
+          if (has_fp64) {
+            run<double>(gpu, iterations, order);
+          }
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, order);
+          if (has_fp64) {
+            run<double>(gpu, iterations, order);
+          }
 #endif
-        run<float>(gpu, iterations, order);
-        run<double>(gpu, iterations, order);
     }
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
+    return 1;
   }
   catch (std::exception e) {
     std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
   }
 
   return 0;

From f9fd8a4a8dccac8c39cbf6f41d9f01cec34a55c6 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 9 May 2019 15:13:46 -0700
Subject: [PATCH 199/245] fix syntax errors

---
 Cxx11/stencil-sycl.cc            | 1 +
 Cxx11/transpose-explicit-sycl.cc | 3 +--
 Cxx11/transpose-sycl.cc          | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 1d9e34134..d9fa54ff6 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -384,6 +384,7 @@ int main(int argc, char * argv[])
             run<double>(gpu, iterations, n, tile_size, star, radius);
           }
 #endif
+        }
     }
   }
   catch (cl::sycl::exception e) {
diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc
index cb4e31c05..cedeafd68 100644
--- a/Cxx11/transpose-explicit-sycl.cc
+++ b/Cxx11/transpose-explicit-sycl.cc
@@ -258,7 +258,6 @@ int main(int argc, char * argv[])
 #endif
 
   try {
-
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
@@ -267,7 +266,6 @@ int main(int argc, char * argv[])
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
 #endif
-
         run<float>(host, iterations, order);
         run<double>(host, iterations, order);
     }
@@ -321,6 +319,7 @@ int main(int argc, char * argv[])
             run<double>(gpu, iterations, order);
           }
 #endif
+        }
     }
   }
   catch (cl::sycl::exception e) {
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 5ed7b9805..761fa136d 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -224,7 +224,6 @@ int main(int argc, char * argv[])
 #endif
 
   try {
-
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
@@ -233,7 +232,6 @@ int main(int argc, char * argv[])
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
         std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
 #endif
-
         run<float>(host, iterations, order);
         run<double>(host, iterations, order);
     }
@@ -287,6 +285,7 @@ int main(int argc, char * argv[])
             run<double>(gpu, iterations, order);
           }
 #endif
+        }
     }
   }
   catch (cl::sycl::exception e) {

From 24fceb29510bdb252dc157f8c0e33c27df5d97c4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 13 May 2019 09:40:37 -0400
Subject: [PATCH 200/245] build C1z p2p-hyperplane-openmp

---
 travis/build-run-prk.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index 1f40f02a8..dcb85da2a 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -173,7 +173,7 @@ case "$PRK_TARGET" in
             g*)
                 # Host
                 echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp
                 $PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
                 $PRK_TARGET_PATH/p2p-innerloop-openmp     10 1024
                 $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024

From fdbcd4b3ff1c78cec3b94f49b852cdd548023ffb Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 13 May 2019 09:43:31 -0400
Subject: [PATCH 201/245] fix or disable all the RAJA stuff (#399)

- use range everywhere we needed to
- remove some unused code that would not work anymore anyways
- simplify some nested loop cases that should be revisited later
---
 Cxx11/Makefile                |  4 +--
 Cxx11/generate-cxx-stencil.py |  9 +++----
 Cxx11/nstream-raja.cc         |  3 ++-
 Cxx11/nstream-vector-raja.cc  | 11 +++++---
 Cxx11/p2p-raja.cc             | 12 ++++++---
 Cxx11/p2p-vector-raja.cc      |  8 ++++--
 Cxx11/stencil-raja.cc         | 13 +++------
 Cxx11/stencil-vector-raja.cc  | 40 +++++++---------------------
 Cxx11/stencil_raja.hpp        | 50 +++++++++++++++++++++--------------
 9 files changed, 73 insertions(+), 77 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index f96e63744..1bb8d88ce 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -118,8 +118,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range
 
 kokkos: stencil-kokkos transpose-kokkos nstream-kokkos
 
-raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \
-      p2p-raja stencil-raja transpose-raja nstream-raja
+raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \
+      p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja
 
 cuda: stencil-cuda transpose-cuda nstream-cuda
 
diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py
index 2f557fe3f..e4f187007 100755
--- a/Cxx11/generate-cxx-stencil.py
+++ b/Cxx11/generate-cxx-stencil.py
@@ -111,12 +111,9 @@ def codegen(src,pattern,stencil_size,radius,W,model):
         src.write('    });\n')
     elif (model=='raja'):
         src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {\n')
-        #src.write('    RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>\n')
-        #src.write('            ( RAJA::RangeSegment('+str(radius)+',n-'+str(radius)+'),'
-        #                        'RAJA::RangeSegment('+str(radius)+',n-'+str(radius)+'),\n')
-        #src.write('              [&](RAJA::Index_type i, RAJA::Index_type j) {\n')
-        src.write('    RAJA::forall<thread_exec>(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type i) {\n')
-        src.write('      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type j) {\n')
+        src.write('    RAJA::RangeSegment inside('+str(radius)+',n-'+str(radius)+');\n')
+        src.write('    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {\n')
+        src.write('      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {\n')
         bodygen(src,pattern,stencil_size,radius,W,model)
         src.write('      });\n')
         src.write('    });\n')
diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc
index ef7b6c08e..dcba4cbf2 100644
--- a/Cxx11/nstream-raja.cc
+++ b/Cxx11/nstream-raja.cc
@@ -166,7 +166,8 @@ int main(int argc, char * argv[])
   ar *= length;
 
   RAJA::ReduceSum<RAJA::seq_reduce, double> reduced_asum(0.0);
-  RAJA::forall<RAJA::seq_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+  //RAJA::forall<RAJA::seq_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+  RAJA::forall<RAJA::seq_exec>(range, [=](RAJA::Index_type i) {
       reduced_asum += std::fabs(A(i));
   });
   double asum(reduced_asum);
diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc
index ee3986e50..8db807cc4 100644
--- a/Cxx11/nstream-vector-raja.cc
+++ b/Cxx11/nstream-vector-raja.cc
@@ -124,10 +124,13 @@ int main(int argc, char * argv[])
   std::vector<double> B(length);
   std::vector<double> C(length);
 
+  RAJA::RangeSegment range(0, length);
+
   double scalar(3);
 
   {
-    RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+    //RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+    RAJA::forall<thread_exec>(range, [&](RAJA::Index_type i) {
         A[i] = 0.0;
         B[i] = 2.0;
         C[i] = 2.0;
@@ -137,7 +140,8 @@ int main(int argc, char * argv[])
 
       if (iter==1) nstream_time = prk::wtime();
 
-      RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+      //RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+      RAJA::forall<thread_exec>(range, [&](RAJA::Index_type i) {
           A[i] += B[i] + scalar * C[i];
       });
     }
@@ -158,7 +162,8 @@ int main(int argc, char * argv[])
   ar *= length;
 
   RAJA::ReduceSum<RAJA::seq_reduce, double> reduced_asum(0.0);
-  RAJA::forall<RAJA::seq_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+  //RAJA::forall<RAJA::seq_exec>(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) {
+  RAJA::forall<RAJA::seq_exec>(range, [=](RAJA::Index_type i) {
       reduced_asum += std::fabs(A[i]);
   });
   double asum(reduced_asum);
diff --git a/Cxx11/p2p-raja.cc b/Cxx11/p2p-raja.cc
index 202d9b6b6..2900d3fbc 100644
--- a/Cxx11/p2p-raja.cc
+++ b/Cxx11/p2p-raja.cc
@@ -121,10 +121,14 @@ int main(int argc, char* argv[])
   double * RESTRICT Amem = new double[m*n];
   matrix grid(Amem, m, n);
 
-  for (int i=0; i<m; i++) {
-    for (int j=0; j<n; j++) {
-      grid(i,j) = 0.0;
-    }
+  {
+    RAJA::RangeSegment range(0,m);
+    RAJA::forall<thread_exec>(range, [=](RAJA::Index_type i) {
+    //for (int i=0; i<m; i++) {
+      for (int j=0; j<n; j++) {
+        grid(i,j) = 0.0;
+      }
+    });
   }
   // set boundary values (bottom and left side of grid)
   for (int j=0; j<n; j++) {
diff --git a/Cxx11/p2p-vector-raja.cc b/Cxx11/p2p-vector-raja.cc
index e4faddccc..5b6626001 100644
--- a/Cxx11/p2p-vector-raja.cc
+++ b/Cxx11/p2p-vector-raja.cc
@@ -150,14 +150,18 @@ int main(int argc, char* argv[])
     });
 #else
     for (auto j=1; j<n; j++) {
-      RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
+      //RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
+      RAJA::RangeSegment range(1, j+1);
+      RAJA::forall<thread_exec>(range, [&](RAJA::Index_type i) {
         auto x = i;
         auto y = j-i+1;
         grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
       });
     }
     for (auto j=n-2; j>=1; j--) {
-      RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
+      //RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) {
+      RAJA::RangeSegment range(1, j+1);
+      RAJA::forall<thread_exec>(range, [&](RAJA::Index_type i) {
         auto x = n+i-j-1;
         auto y = n-i;
         grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
diff --git a/Cxx11/stencil-raja.cc b/Cxx11/stencil-raja.cc
index 5fa333bce..e52638c8b 100644
--- a/Cxx11/stencil-raja.cc
+++ b/Cxx11/stencil-raja.cc
@@ -207,17 +207,12 @@ int main(int argc, char* argv[])
   size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
 
   // compute L1 norm in parallel
-#if 0
-  // This leads to incorrect computation of the norm.
-  RAJA::ReduceSum<RAJA::omp_reduce, double> reduced_norm(0.0);
-  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>
-#else
+  RAJA::RangeSegment inside(radius,n-radius);
   RAJA::ReduceSum<RAJA::seq_reduce, double> reduced_norm(0.0);
-  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>>
-#endif
-          ( RAJA::RangeSegment(radius,n-radius), RAJA::RangeSegment(radius,n-radius),
-            [&](RAJA::Index_type i, RAJA::Index_type j) {
+  RAJA::forall<RAJA::seq_exec>(inside, [&](RAJA::Index_type i) {
+    RAJA::forall<RAJA::seq_exec>(inside, [&](RAJA::Index_type j) {
       reduced_norm += std::fabs(out(i,j));
+    });
   });
   double norm = reduced_norm / active_points;
 
diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc
index 822a45c00..ce4a7278a 100644
--- a/Cxx11/stencil-vector-raja.cc
+++ b/Cxx11/stencil-vector-raja.cc
@@ -168,21 +168,14 @@ int main(int argc, char* argv[])
   std::vector<double> in(n*n);
   std::vector<double> out(n*n);
 
-#if 0
-  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>
-          ( RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n),
-            [&](RAJA::Index_type i, RAJA::Index_type j) {
-      in[i*n+j] = static_cast<double>(i+j);
-      out[i*n+j] = 0.0;
-  });
-#else
-  RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type i) {
-    RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type j) {
+  RAJA::RangeSegment range(0, n);
+
+  RAJA::forall<thread_exec>(range, [&](RAJA::Index_type i) {
+    RAJA::forall<RAJA::simd_exec>(range, [&](RAJA::Index_type j) {
       in[i*n+j] = static_cast<double>(i+j);
       out[i*n+j] = 0.0;
     });
   });
-#endif
 
   for (auto iter = 0; iter<=iterations; iter++) {
 
@@ -190,19 +183,11 @@ int main(int argc, char* argv[])
     // Apply the stencil operator
     stencil(n, tile_size, in, out);
     // Add constant to solution to force refresh of neighbor data, if any
-#if 0
-    RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>
-            ( RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n),
-              [&](RAJA::Index_type i, RAJA::Index_type j) {
-        in[i*n+j] += 1.0;
-    });
-#else
-    RAJA::forall<thread_exec>(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type j) {
+    RAJA::forall<thread_exec>(range, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(range, [&](RAJA::Index_type j) {
         in[i*n+j] += 1.0;
       });
     });
-#endif
   }
 
   stencil_time = prk::wtime() - stencil_time;
@@ -215,17 +200,12 @@ int main(int argc, char* argv[])
   size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
 
   // compute L1 norm in parallel
-#if 0
-  // This leads to incorrect computation of the norm.
-  RAJA::ReduceSum<RAJA::omp_reduce, double> reduced_norm(0.0);
-  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<thread_exec, RAJA::simd_exec>>>
-#else
+  RAJA::RangeSegment inside(radius,n-radius);
   RAJA::ReduceSum<RAJA::seq_reduce, double> reduced_norm(0.0);
-  RAJA::forallN<RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>>
-#endif
-          ( RAJA::RangeSegment(radius,n-radius), RAJA::RangeSegment(radius,n-radius),
-            [&](RAJA::Index_type i, RAJA::Index_type j) {
+  RAJA::forall<RAJA::seq_exec>(inside, [&](RAJA::Index_type i) {
+    RAJA::forall<RAJA::seq_exec>(inside, [&](RAJA::Index_type j) {
       reduced_norm += std::fabs(out[i*n+j]);
+    });
   });
   double norm = reduced_norm / active_points;
 
diff --git a/Cxx11/stencil_raja.hpp b/Cxx11/stencil_raja.hpp
index ebd2d28b1..d6d912a1d 100644
--- a/Cxx11/stencil_raja.hpp
+++ b/Cxx11/stencil_raja.hpp
@@ -1,6 +1,7 @@
 void star1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(1,n-1);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i)*n+(j-1)] * -0.5
                           +in[(i-1)*n+(j)] * -0.5
                           +in[(i+1)*n+(j)] * 0.5
@@ -10,8 +11,9 @@ void star1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(2,n-2);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i)*n+(j-2)] * -0.125
                           +in[(i)*n+(j-1)] * -0.25
                           +in[(i-2)*n+(j)] * -0.125
@@ -25,8 +27,9 @@ void star2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(3,n-3);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556
                           +in[(i)*n+(j-2)] * -0.0833333333333
                           +in[(i)*n+(j-1)] * -0.166666666667
@@ -44,8 +47,9 @@ void star3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(4,n-4);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125
                           +in[(i)*n+(j-3)] * -0.0416666666667
                           +in[(i)*n+(j-2)] * -0.0625
@@ -67,8 +71,9 @@ void star4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void star5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(5,n-5);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i)*n+(j-5)] * -0.02
                           +in[(i)*n+(j-4)] * -0.025
                           +in[(i)*n+(j-3)] * -0.0333333333333
@@ -94,8 +99,9 @@ void star5(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid1(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(1,n-1);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25
                           +in[(i)*n+(j-1)] * -0.25
                           +in[(i-1)*n+(j)] * -0.25
@@ -108,8 +114,9 @@ void grid1(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid2(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(2,n-2);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625
                           +in[(i-1)*n+(j-2)] * -0.0208333333333
                           +in[(i)*n+(j-2)] * -0.0208333333333
@@ -136,8 +143,9 @@ void grid2(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid3(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(3,n-3);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778
                           +in[(i-2)*n+(j-3)] * -0.00555555555556
                           +in[(i-1)*n+(j-3)] * -0.00555555555556
@@ -186,8 +194,9 @@ void grid3(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid4(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(4,n-4);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625
                           +in[(i-3)*n+(j-4)] * -0.00223214285714
                           +in[(i-2)*n+(j-4)] * -0.00223214285714
@@ -266,8 +275,9 @@ void grid4(const int n, const int t, std::vector<double> & in, std::vector<doubl
 }
 
 void grid5(const int n, const int t, std::vector<double> & in, std::vector<double> & out) {
-    RAJA::forall<thread_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) {
-      RAJA::forall<RAJA::simd_exec>(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) {
+    RAJA::RangeSegment inside(5,n-5);
+    RAJA::forall<thread_exec>(inside, [&](RAJA::Index_type i) {
+      RAJA::forall<RAJA::simd_exec>(inside, [&](RAJA::Index_type j) {
             out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01
                           +in[(i-4)*n+(j-5)] * -0.00111111111111
                           +in[(i-3)*n+(j-5)] * -0.00111111111111

From 5f4b5d68d288de14eefa943eebd6311f9136153b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 12 May 2019 18:36:04 -0700
Subject: [PATCH 202/245] switch to GCC 9 w/ its PSTL

---
 common/make.defs.gcc | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index b0566487a..f4552bd87 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -4,7 +4,7 @@
 #
 # Base compilers and language options
 #
-VERSION=-8
+VERSION=-9
 # C99 is required in some implementations.
 CC=gcc${VERSION} -std=c11 -pthread
 #EXTRA_CLIBS=-lrt
@@ -74,15 +74,16 @@ METALFLAG=-framework MetalPerformanceShaders
 #
 # TBB
 #
-TBBDIR=/usr/local/Cellar/tbb/2019_U3_1
-TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+TBBDIR=/usr/local/Cellar/tbb/2019_U5_1
+TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-I/usr/local/Cellar/boost/1.68.0_1/include
-#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
-RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
-PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}
+#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
@@ -95,7 +96,7 @@ THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
-SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
+SYCLCXX=${CXX} -O3 -Wall -std=c++17 ${OPENMPFLAG}
 SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx

From 30f5a22395cdb819eeb28ea949de4600818d22a3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 12 May 2019 18:36:41 -0700
Subject: [PATCH 203/245] move valarray header to the only place it is needed

---
 Cxx11/nstream-valarray.cc | 1 +
 Cxx11/prk_util.h          | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Cxx11/nstream-valarray.cc b/Cxx11/nstream-valarray.cc
index 656f69f9c..bcc6361ac 100644
--- a/Cxx11/nstream-valarray.cc
+++ b/Cxx11/nstream-valarray.cc
@@ -63,6 +63,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include <valarray>
 
 int main(int argc, char * argv[])
 {
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index c969af7fd..d09c16c3b 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -54,7 +54,7 @@
 #include <exception>
 #include <list>
 #include <vector>
-#include <valarray>
+//#include <valarray>
 
 #include <chrono>
 #include <random>

From fc349a1d110474b9f9092076383c7438a472619e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 12 May 2019 18:44:53 -0700
Subject: [PATCH 204/245] PSTL in GCC9 - support it

---
 Cxx11/nstream-vector-pstl.cc |  6 +++---
 Cxx11/prk_pstl.h             | 14 ++++++++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc
index 21b5e0b45..b69a8171c 100644
--- a/Cxx11/nstream-vector-pstl.cc
+++ b/Cxx11/nstream-vector-pstl.cc
@@ -126,7 +126,7 @@ int main(int argc, char * argv[])
   double scalar(3);
 
   {
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
     std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
@@ -140,11 +140,11 @@ int main(int argc, char * argv[])
         C[i] = 2;
     });
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) nstream_time = prk::wtime();
 
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
       std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h
index 11e0368bb..97efca244 100644
--- a/Cxx11/prk_pstl.h
+++ b/Cxx11/prk_pstl.h
@@ -36,18 +36,24 @@
 #define USE_INTEL_PSTL
 #endif
 
-#ifdef USE_PSTL
-# ifdef USE_INTEL_PSTL
+#if defined(USE_PSTL)
+# if defined(__GNUC__) && (__GNUC__ >= 9)
+#  include <execution>
+#  include <algorithm>
+#  include <numeric>
+//#  include <memory>
+namespace exec = __pstl::execution;
+# elif defined(USE_INTEL_PSTL)
 #  include <pstl/execution>
 #  include <pstl/algorithm>
 #  include <pstl/numeric>
-#  include <pstl/memory>
+//#  include <pstl/memory>
+namespace exec = std::execution;
 # elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \
        ( (__GNUC__ >= 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
 #  include <parallel/algorithm>
 #  include <parallel/numeric>
 # endif
-namespace exec = std::execution;
 #endif
 
 #endif /* PRK_PSTL_H */

From 7dee91f52205b27e70864b605b2d46209b0fcca4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 12 May 2019 18:50:23 -0700
Subject: [PATCH 205/245] PSTL in GCC9 - support it

---
 Cxx11/transpose-vector-pstl.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc
index ac7aefb8a..4b5839647 100644
--- a/Cxx11/transpose-vector-pstl.cc
+++ b/Cxx11/transpose-vector-pstl.cc
@@ -108,14 +108,14 @@ int main(int argc, char * argv[])
 
   auto range = prk::range(0,order);
 
-  auto trans_time = 0.0;
+  double trans_time(0);
 
-  for (auto iter = 0; iter<=iterations; iter++) {
+  for (int iter = 0; iter<=iterations; iter++) {
 
     if (iter==1) trans_time = prk::wtime();
 
     // transpose
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
   std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) {
     std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \

From d152d043a6b6ae567b98c018baf58e381813d128 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 12 May 2019 18:56:24 -0700
Subject: [PATCH 206/245] add GCC-9 to supported PSTL configs

remove non-transform dead code in stencil
stop using auto for iter and timer - this is pointless
---
 Cxx11/p2p-hyperplane-vector-pstl.cc |  6 +++---
 Cxx11/stencil-vector-pstl.cc        | 27 +++++----------------------
 2 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc
index c64757e6d..9b02a65a0 100644
--- a/Cxx11/p2p-hyperplane-vector-pstl.cc
+++ b/Cxx11/p2p-hyperplane-vector-pstl.cc
@@ -119,7 +119,7 @@ int main(int argc, char* argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  auto pipeline_time = 0.0; // silence compiler warning
+  double pipeline_time(0);
 
   std::vector<double> grid(n*n,0.0);
 
@@ -129,7 +129,7 @@ int main(int argc, char* argv[])
     grid[j*n+0] = static_cast<double>(j);
   }
 
-  for (auto iter = 0; iter<=iterations; iter++) {
+  for (int iter = 0; iter<=iterations; iter++) {
 
     if (iter==1) pipeline_time = prk::wtime();
 
@@ -156,7 +156,7 @@ int main(int argc, char* argv[])
         const auto begin = std::max(2,i-(nb+1)+2);
         const auto end   = std::min(i,nb+1)+1;
         auto range = prk::range(begin,end);
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
         std::for_each( exec::par, std::begin(range), std::end(range), [&] (auto j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc
index ca3c83ec0..cae97761d 100644
--- a/Cxx11/stencil-vector-pstl.cc
+++ b/Cxx11/stencil-vector-pstl.cc
@@ -63,7 +63,7 @@
 #include "prk_util.h"
 #include "prk_pstl.h"
 // See ParallelSTL.md for important information.
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
 #include "stencil_pstl.hpp"
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
@@ -175,14 +175,14 @@ int main(int argc, char* argv[])
   // Allocate space and perform the computation
   //////////////////////////////////////////////////////////////////////
 
-  auto stencil_time = 0.0;
+  double stencil_time(0);
 
   std::vector<double> in(n*n);
   std::vector<double> out(n*n);
 
   // initialize the input and output arrays
   auto range = prk::range(0,n);
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
   std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) {
     std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) {
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
@@ -198,35 +198,18 @@ int main(int argc, char* argv[])
     });
   });
 
-  for (auto iter = 0; iter<=iterations; iter++) {
+  for (int iter = 0; iter<=iterations; iter++) {
     if (iter==1) stencil_time = prk::wtime();
     // Apply the stencil operator
     stencil(n, tile_size, in, out);
     // Add constant to solution to force refresh of neighbor data, if any
-#if 0
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
-    std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) {
-      std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) {
-#elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
-                        && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
-      __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) {
-        __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int j) {
-#else
-    std::for_each( std::begin(range), std::end(range), [&] (int i) {
-      std::for_each( std::begin(range), std::end(range), [&] (int j) {
-#endif
-        in[i*n+j] += 1.0;
-      });
-    });
-#else
-#if defined(USE_PSTL) && defined(USE_INTEL_PSTL)
+#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) )
     std::transform( exec::par_unseq, in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
 #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \
                         && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) )
     __gnu_parallel::transform( in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
 #else
     std::transform( in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; });
-#endif
 #endif
   }
 

From 54dbbfbe358d61d20a71854846826b353760dd83 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 13 May 2019 09:39:04 -0400
Subject: [PATCH 207/245] add valarray include to transpose instance

---
 Cxx11/transpose-valarray.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Cxx11/transpose-valarray.cc b/Cxx11/transpose-valarray.cc
index 029f893be..55420723b 100644
--- a/Cxx11/transpose-valarray.cc
+++ b/Cxx11/transpose-valarray.cc
@@ -53,6 +53,7 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include <valarray>
 
 int main(int argc, char * argv[])
 {

From 7bfa597b76a65be9dad9550f1451543a3fbcf2a4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 14 May 2019 10:22:58 -0400
Subject: [PATCH 208/245] reuse OpenMP for sequential versions of nstream and
 transpose

---
 C1z/Makefile                        |   9 +-
 C1z/{nstream.c => nstream-openmp.c} |   0
 C1z/transpose.c                     | 176 ----------------------------
 3 files changed, 6 insertions(+), 179 deletions(-)
 rename C1z/{nstream.c => nstream-openmp.c} (100%)
 delete mode 100644 C1z/transpose.c

diff --git a/C1z/Makefile b/C1z/Makefile
index 535ed9eaa..399751925 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -71,6 +71,12 @@ p2p-innerloop: p2p-innerloop-openmp.c prk_util.h
 p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
+nstream: nstream-openmp.c prk_util.h
+	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
+
+transpose: transpose-openmp.c prk_util.h
+	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
+
 %-mpi: %-mpi.c prk_util.h
 	$(MPICC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
@@ -92,9 +98,6 @@ p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h
 %-taskloop: %-taskloop.c prk_util.h
 	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
 
-nstream-openmp: nstream.c prk_util.h
-	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
-
 %-openmp: %-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
 
diff --git a/C1z/nstream.c b/C1z/nstream-openmp.c
similarity index 100%
rename from C1z/nstream.c
rename to C1z/nstream-openmp.c
diff --git a/C1z/transpose.c b/C1z/transpose.c
deleted file mode 100644
index 797c2395f..000000000
--- a/C1z/transpose.c
+++ /dev/null
@@ -1,176 +0,0 @@
-///
-/// Copyright (c) 2013, Intel Corporation
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions
-/// are met:
-///
-/// * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-/// * Redistributions in binary form must reproduce the above
-///       copyright notice, this list of conditions and the following
-///       disclaimer in the documentation and/or other materials provided
-///       with the distribution.
-/// * Neither the name of Intel Corporation nor the names of its
-///       contributors may be used to endorse or promote products
-///       derived from this software without specific prior written
-///       permission.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-
-//////////////////////////////////////////////////////////////////////
-///
-/// NAME:    transpose
-///
-/// PURPOSE: This program measures the time for the transpose of a
-///          column-major stored matrix into a row-major stored matrix.
-///
-/// USAGE:   Program input is the matrix order and the number of times to
-///          repeat the operation:
-///
-///          transpose <matrix_size> <# iterations> [tile size]
-///
-///          An optional parameter specifies the tile size used to divide the
-///          individual matrix blocks for improved cache and TLB performance.
-///
-///          The output consists of diagnostics to make sure the
-///          transpose worked and timing statistics.
-///
-/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
-///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
-///          C11-ification by Jeff Hammond, June 2017.
-///
-//////////////////////////////////////////////////////////////////////
-
-#include "prk_util.h"
-
-int main(int argc, char * argv[])
-{
-  printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
-  printf("C11 Matrix transpose: B = A^T\n");
-
-  //////////////////////////////////////////////////////////////////////
-  /// Read and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  if (argc < 3) {
-    printf("Usage: <# iterations> <matrix order> [tile size]\n");
-    return 1;
-  }
-
-  // number of times to do the transpose
-  int iterations = atoi(argv[1]);
-  if (iterations < 1) {
-    printf("ERROR: iterations must be >= 1\n");
-    return 1;
-  }
-
-  // order of a the matrix
-  int order = atoi(argv[2]);
-  if (order <= 0) {
-    printf("ERROR: Matrix Order must be greater than 0\n");
-    return 1;
-  }
-
-  // default tile size for tiling of local transpose
-  int tile_size = (argc>4) ? atoi(argv[3]) : 32;
-  // a negative tile size means no tiling of the local transpose
-  if (tile_size <= 0) tile_size = order;
-
-  printf("Number of iterations  = %d\n", iterations);
-  printf("Matrix order          = %d\n", order);
-  printf("Tile size             = %d\n", tile_size);
-
-  //////////////////////////////////////////////////////////////////////
-  /// Allocate space for the input and transpose matrix
-  //////////////////////////////////////////////////////////////////////
-
-  double trans_time = 0.0;
-
-  size_t bytes = order*order*sizeof(double);
-  double * restrict A = prk_malloc(bytes);
-  double * restrict B = prk_malloc(bytes);
-
-  {
-    for (int i=0;i<order; i++) {
-      for (int j=0;j<order;j++) {
-        A[i*order+j] = (double)(i*order+j);
-        B[i*order+j] = 0.0;
-      }
-    }
-
-    for (int iter = 0; iter<=iterations; iter++) {
-
-      if (iter==1) trans_time = prk_wtime();
-
-      // transpose the  matrix
-      if (tile_size < order) {
-        for (int it=0; it<order; it+=tile_size) {
-          for (int jt=0; jt<order; jt+=tile_size) {
-            for (int i=it; i<MIN(order,it+tile_size); i++) {
-              for (int j=jt; j<MIN(order,jt+tile_size); j++) {
-                B[i*order+j] += A[j*order+i];
-                A[j*order+i] += 1.0;
-              }
-            }
-          }
-        }
-      } else {
-        for (int i=0;i<order; i++) {
-          for (int j=0;j<order;j++) {
-            B[i*order+j] += A[j*order+i];
-            A[j*order+i] += 1.0;
-          }
-        }
-      }
-    }
-    trans_time = prk_wtime() - trans_time;
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  // Analyze and output results
-  //////////////////////////////////////////////////////////////////////
-
-  const double addit = (iterations+1.) * (iterations/2.);
-  double abserr = 0.0;
-  for (int j=0; j<order; j++) {
-    for (int i=0; i<order; i++) {
-      const size_t ij = i*order+j;
-      const size_t ji = j*order+i;
-      const double reference = (double)(ij)*(1.+iterations)+addit;
-      abserr += fabs(B[ji] - reference);
-    }
-  }
-
-  prk_free(A);
-  prk_free(B);
-
-#ifdef VERBOSE
-  printf("Sum of absolute differences: %lf\n", abserr);
-#endif
-
-  const double epsilon = 1.0e-8;
-  if (abserr < epsilon) {
-    printf("Solution validates\n");
-    const double avgtime = trans_time/iterations;
-    printf("Rate (MB/s): %lf Avg time (s): %lf\n", 2.0e-6 * bytes/avgtime, avgtime );
-  } else {
-    printf("ERROR: Aggregate squared error %lf exceeds threshold %lf\n", abserr, epsilon );
-    return 1;
-  }
-
-  return 0;
-}
-
-

From 10bad8ec0e88768ecc373a83d015500c16ecfc29 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 14 May 2019 10:37:56 -0400
Subject: [PATCH 209/245] remove innerloop in favor of hyperplane and cleanup
 CI

---
 .gitignore                 |   8 +-
 C1z/Makefile               |  12 +-
 C1z/p2p-innerloop-openmp.c | 179 -----------------------------
 C1z/stencil.c              | 230 -------------------------------------
 travis/build-run-prk.sh    |  57 ++++-----
 5 files changed, 34 insertions(+), 452 deletions(-)
 delete mode 100644 C1z/p2p-innerloop-openmp.c
 delete mode 100644 C1z/stencil.c

diff --git a/.gitignore b/.gitignore
index a7e76eb32..2c32fc0c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,8 +108,12 @@ C1z/nstream-memkind-openmp
 C1z/nstream-mmap
 C1z/nstream-mmap-openmp
 C1z/p2p
+C1z/p2p-avx
+C1z/p2p-sse
 C1z/p2p-innerloop
 C1z/p2p-innerloop-openmp
+C1z/p2p-hyperplane
+C1z/p2p-hyperplane-openmp
 C1z/p2p-tasks-openmp
 C1z/p2p-simd-openmp
 C1z/stencil
@@ -280,7 +284,3 @@ FORTRAN/transpose-ornlacc
 RUST/p2p/Cargo.lock
 RUST/stencil/Cargo.lock
 RUST/transpose/Cargo.lock
-nstream
-../C1z/p2p-avx
-../C1z/p2p-sse
-../C1z/p2p-hyperplane-openmp
diff --git a/C1z/Makefile b/C1z/Makefile
index 399751925..23562f35f 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -45,11 +45,11 @@ endif
 
 all: serial thread openmp taskloop $(EXTRA)
 
-serial: nstream p2p p2p-innerloop p2p-hyperplane stencil transpose
+serial: nstream p2p p2p-hyperplane stencil transpose
 
 thread: transpose-thread
 
-openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp p2p-hyperplane-openmp
+openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp stencil-openmp transpose-openmp p2p-hyperplane-openmp
 
 mpi: nstream-mpi
 
@@ -65,15 +65,15 @@ cilk: stencil-cilk transpose-cilk
 
 ispc: transpose-ispc
 
-p2p-innerloop: p2p-innerloop-openmp.c prk_util.h
-	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
-
 p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
 nstream: nstream-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
+stencil: stencil-openmp.c prk_util.h
+	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
+
 transpose: transpose-openmp.c prk_util.h
 	$(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@
 
@@ -126,7 +126,7 @@ clean:
 	-rm -f *.optrpt
 	-rm -f *.dwarf
 	-rm -rf *.dSYM # Mac
-	-rm -f nstream p2p p2p-innerloop stencil transpose
+	-rm -f nstream p2p p2p-hyperplane stencil transpose
 	-rm -f *-openmp
 	-rm -f *-mpi
 	-rm -f *-target
diff --git a/C1z/p2p-innerloop-openmp.c b/C1z/p2p-innerloop-openmp.c
deleted file mode 100644
index 35fe80cba..000000000
--- a/C1z/p2p-innerloop-openmp.c
+++ /dev/null
@@ -1,179 +0,0 @@
-///
-/// Copyright (c) 2013, Intel Corporation
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions
-/// are met:
-///
-/// * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-/// * Redistributions in binary form must reproduce the above
-///       copyright notice, this list of conditions and the following
-///       disclaimer in the documentation and/or other materials provided
-///       with the distribution.
-/// * Neither the name of Intel Corporation nor the names of its
-///       contributors may be used to endorse or promote products
-///       derived from this software without specific prior written
-///       permission.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-
-//////////////////////////////////////////////////////////////////////
-///
-/// NAME:    Pipeline
-///
-/// PURPOSE: This program tests the efficiency with which point-to-point
-///          synchronization can be carried out. It does so by executing
-///          a pipelined algorithm on an n^2 grid. The first array dimension
-///          is distributed among the threads (stripwise decomposition).
-///
-/// USAGE:   The program takes as input the
-///          dimensions of the grid, and the number of iterations on the grid
-///
-///                <progname> <iterations> <n>
-///
-///          The output consists of diagnostics to make sure the
-///          algorithm worked, and of timing statistics.
-///
-/// FUNCTIONS CALLED:
-///
-///          Other than standard C functions, the following
-///          functions are used in this program:
-///
-///          wtime()
-///
-/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
-///          - C99-ification by Jeff Hammond, February 2016.
-///          - C11-ification by Jeff Hammond, June 2017.
-///
-//////////////////////////////////////////////////////////////////////
-
-#include "prk_util.h"
-
-int main(int argc, char* argv[])
-{
-  printf("Parallel Research Kernels version %.2f\n", PRKVERSION);
-#ifdef _OPENMP
-  printf("C11/OpenMP INNERLOOP pipeline execution on 2D grid\n");
-#else
-  printf("C11/Serial INNERLOOP pipeline execution on 2D grid\n");
-#endif
-
-  //////////////////////////////////////////////////////////////////////
-  // Process and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  if (argc < 3) {
-    printf("Usage: <# iterations> <array dimension>\n");
-    return 1;
-  }
-
-  // number of times to run the pipeline algorithm
-  int iterations = atoi(argv[1]);
-  if (iterations < 1) {
-    printf("ERROR: iterations must be >= 1\n");
-    return 1;
-  }
-
-  // grid dimensions
-  int n = atol(argv[2]);
-  if (n < 1) {
-    printf("ERROR: grid dimension must be positive: %d\n", n);
-    return 1;
-  }
-
-#ifdef _OPENMP
-  printf("Number of threads (max)   = %d\n", omp_get_max_threads());
-#endif
-  printf("Number of iterations      = %d\n", iterations);
-  printf("Grid sizes                = %d,%d\n", n, n);
-
-  //////////////////////////////////////////////////////////////////////
-  // Allocate space and perform the computation
-  //////////////////////////////////////////////////////////////////////
-
-  double pipeline_time = 0.0; // silence compiler warning
-
-  size_t bytes = n*n*sizeof(double);
-  double * restrict grid = prk_malloc(bytes);
-
-  OMP_PARALLEL()
-  {
-    OMP_FOR()
-    for (int i=0; i<n; i++) {
-      OMP_SIMD
-      for (int j=0; j<n; j++) {
-        grid[i*n+j] = 0.0;
-      }
-    }
-
-    // set boundary values (bottom and left side of grid)
-    OMP_MASTER
-    {
-      for (int j=0; j<n; j++) {
-        grid[0*n+j] = (double)j;
-      }
-      for (int i=0; i<n; i++) {
-        grid[i*n+0] = (double)i;
-      }
-    }
-    OMP_BARRIER
-
-    for (int iter = 0; iter<=iterations; iter++) {
-
-      if (iter==1) {
-          OMP_BARRIER
-          OMP_MASTER
-          pipeline_time = prk_wtime();
-      }
-
-      for (int i=2; i<=2*n-2; i++) {
-        OMP_FOR(simd)
-        for (int j=MAX(2,i-n+2); j<=MIN(i,n); j++) {
-          const int x = i-j+2-1;
-          const int y = j-1;
-          grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
-        }
-      }
-      OMP_MASTER
-      grid[0*n+0] = -grid[(n-1)*n+(n-1)];
-    }
-    OMP_BARRIER
-    OMP_MASTER
-    pipeline_time = prk_wtime() - pipeline_time;
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  // Analyze and output results.
-  //////////////////////////////////////////////////////////////////////
-
-  const double epsilon = 1.e-8;
-  const double corner_val = ((iterations+1.)*(n+n-2.));
-  if ( (fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
-    printf("ERROR: checksum %lf does not match verification value %lf\n", grid[(n-1)*n+(n-1)], corner_val);
-    return 1;
-  }
-
-  prk_free(grid);
-
-#ifdef VERBOSE
-  printf("Solution validates; verification value = %lf\n", corner_val );
-#else
-  printf("Solution validates\n" );
-#endif
-  double avgtime = pipeline_time/iterations;
-  printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 2.0e-6 * ( (n-1)*(n-1) )/avgtime, avgtime );
-
-  return 0;
-}
diff --git a/C1z/stencil.c b/C1z/stencil.c
deleted file mode 100644
index 50ff8cbaa..000000000
--- a/C1z/stencil.c
+++ /dev/null
@@ -1,230 +0,0 @@
-
-///
-/// Copyright (c) 2013, Intel Corporation
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions
-/// are met:
-///
-/// * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-/// * Redistributions in binary form must reproduce the above
-///       copyright notice, this list of conditions and the following
-///       disclaimer in the documentation and/or other materials provided
-///       with the distribution.
-/// * Neither the name of Intel Corporation nor the names of its
-///       contributors may be used to endorse or promote products
-///       derived from this software without specific prior written
-///       permission.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-
-//////////////////////////////////////////////////////////////////////
-///
-/// NAME:    Stencil
-///
-/// PURPOSE: This program tests the efficiency with which a space-invariant,
-///          linear, symmetric filter (stencil) can be applied to a square
-///          grid or image.
-///
-/// USAGE:   The program takes as input the linear
-///          dimension of the grid, and the number of iterations on the grid
-///
-///                <progname> <iterations> <grid size>
-///
-///          The output consists of diagnostics to make sure the
-///          algorithm worked, and of timing statistics.
-///
-/// FUNCTIONS CALLED:
-///
-///          Other than standard C functions, the following
-///          functions are used in this program:
-///
-///          wtime()
-///
-/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
-///          - C99-ification by Jeff Hammond, February 2016.
-///          - C11-ification by Jeff Hammond, June 2017.
-///
-//////////////////////////////////////////////////////////////////////
-
-#include "prk_util.h"
-
-typedef void (*stencil_t)(const int, const double * restrict, double * restrict);
-
-void nothing(const int n, const double * restrict in, double * restrict out)
-{
-    printf("You are trying to use a stencil that does not exist.\n");
-    printf("Please generate the new stencil using the code generator.\n");
-    // n will never be zero - this is to silence compiler warnings.
-    if (n==0) printf("%p %p\n", in, out);
-    abort();
-}
-
-#include "stencil_seq.h"
-
-int main(int argc, char * argv[])
-{
-  printf("Parallel Research Kernels version %.2f\n", PRKVERSION);
-  printf("C11 Stencil execution on 2D grid\n");
-
-  //////////////////////////////////////////////////////////////////////
-  // Process and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  if (argc < 3){
-    printf("Usage: <# iterations> <array dimension> [<star/grid> <radius>]\n");
-    return 1;
-  }
-
-  // number of times to run the algorithm
-  int iterations  = atoi(argv[1]);
-  if (iterations < 1) {
-    printf("ERROR: iterations must be >= 1\n");
-    return 1;
-  }
-
-  // linear grid dimension
-  int n  = atoi(argv[2]);
-  if (n < 1) {
-    printf("ERROR: grid dimension must be positive\n");
-    return 1;
-  } else if (n > floor(sqrt(INT_MAX))) {
-    printf("ERROR: grid dimension too large - overflow risk\n");
-    return 1;
-  }
-
-  // stencil pattern
-  bool star = true;
-  if (argc > 3) {
-      char* pattern = argv[3];
-      star = (0==strncmp(pattern,"star",4)) ? true : false;
-  }
-
-  // stencil radius
-  int radius = 2;
-  if (argc > 4) {
-      radius = atoi(argv[4]);
-  }
-
-  if ( (radius < 1) || (2*radius+1 > n) ) {
-    printf("ERROR: Stencil radius negative or too large\n");
-    return 1;
-  }
-
-  printf("Number of iterations      = %d\n", iterations);
-  printf("Grid sizes                = %d\n", n);
-  printf("Type of stencil           = %s\n", (star ? "star" : "grid") );
-  printf("Radius of stencil         = %d\n", radius );
-
-  stencil_t stencil = nothing;
-  if (star) {
-      switch (radius) {
-          case 1: stencil = star1; break;
-          case 2: stencil = star2; break;
-          case 3: stencil = star3; break;
-          case 4: stencil = star4; break;
-          case 5: stencil = star5; break;
-          case 6: stencil = star6; break;
-          case 7: stencil = star7; break;
-          case 8: stencil = star8; break;
-          case 9: stencil = star9; break;
-      }
-  } else {
-      switch (radius) {
-          case 1: stencil = grid1; break;
-          case 2: stencil = grid2; break;
-          case 3: stencil = grid3; break;
-          case 4: stencil = grid4; break;
-          case 5: stencil = grid5; break;
-          case 6: stencil = grid6; break;
-          case 7: stencil = grid7; break;
-          case 8: stencil = grid8; break;
-          case 9: stencil = grid9; break;
-      }
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  // Allocate space and perform the computation
-  //////////////////////////////////////////////////////////////////////
-
-  double stencil_time = 0.0;
-
-  // interior of grid with respect to stencil
-  size_t active_points = (n-2*radius)*(n-2*radius);
-  size_t bytes = n*n*sizeof(double);
-
-  double * restrict in  = prk_malloc(bytes);
-  double * restrict out = prk_malloc(bytes);
-
-  {
-    for (int i=0; i<n; i++) {
-      for (int j=0; j<n; j++) {
-        in[i*n+j]  = (double)(i+j);
-        out[i*n+j] = 0.0;
-      }
-    }
-
-    for (int iter = 0; iter<=iterations; iter++) {
-
-      if (iter==1) stencil_time = prk_wtime();
-
-      // Apply the stencil operator
-      stencil(n, in, out);
-
-      // Add constant to solution to force refresh of neighbor data, if any
-      for (int i=0; i<n; i++) {
-        for (int j=0; j<n; j++) {
-          in[i*n+j] += 1.0;
-        }
-      }
-    }
-    stencil_time = prk_wtime() - stencil_time;
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  // Analyze and output results.
-  //////////////////////////////////////////////////////////////////////
-
-  // compute L1 norm in parallel
-  double norm = 0.0;
-  for (int i=radius; i<n-radius; i++) {
-    for (int j=radius; j<n-radius; j++) {
-      norm += fabs(out[i*n+j]);
-    }
-  }
-  norm /= active_points;
-
-  prk_free(in);
-  prk_free(out);
-
-  // verify correctness
-  const double epsilon = 1.0e-8;
-  double reference_norm = 2.*(iterations+1.);
-  if (fabs(norm-reference_norm) > epsilon) {
-    printf("ERROR: L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm);
-    return 1;
-  } else {
-    printf("Solution validates\n");
-#ifdef VERBOSE
-    printf("L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm);
-#endif
-    const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
-    size_t flops = (2*stencil_size+1) * active_points;
-    double avgtime = stencil_time/iterations;
-    printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0e-6 * (double)flops/avgtime, avgtime );
-  }
-
-  return 0;
-}
diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh
index dcb85da2a..0bb1f92d0 100755
--- a/travis/build-run-prk.sh
+++ b/travis/build-run-prk.sh
@@ -148,10 +148,10 @@ case "$PRK_TARGET" in
         echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs
 
         # C11 without external parallelism
-        ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop p2p-hyperplane
+        ${MAKE} -C $PRK_TARGET_PATH nstream p2p stencil transpose p2p-hyperplane
+        $PRK_TARGET_PATH/nstream         10 16777216 32
         $PRK_TARGET_PATH/p2p             10 1024 1024
         $PRK_TARGET_PATH/p2p             10 1024 1024 100 100
-        $PRK_TARGET_PATH/p2p-innerloop   10 1024
         $PRK_TARGET_PATH/p2p-hyperplane  10 1024
         $PRK_TARGET_PATH/p2p-hyperplane  10 1024 32
         $PRK_TARGET_PATH/stencil         10 1000
@@ -170,12 +170,15 @@ case "$PRK_TARGET" in
         # C11 with OpenMP
         export OMP_NUM_THREADS=2
         case "$CC" in
+            clang*)
+                echo "Skipping Clang since OpenMP support probably missing"
+                ;;
             g*)
                 # Host
                 echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp
+                ${MAKE} -C $PRK_TARGET_PATH nstream-openmp p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp
+                $PRK_TARGET_PATH/nstream-openmp           10 16777216 32
                 $PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
-                $PRK_TARGET_PATH/p2p-innerloop-openmp     10 1024
                 $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024
                 $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024 32
                 $PRK_TARGET_PATH/stencil-openmp           10 1000
@@ -198,27 +201,14 @@ case "$PRK_TARGET" in
                     done
                 done
                 ;;
-            clang*)
-                # Host
-                echo "Skipping Clang since OpenMP support probably missing"
-                #echo "OPENMPFLAG=-fopenmp" >> common/make.defs
-                #${MAKE} -C $PRK_TARGET_PATH openmp
-                #$PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
-                #$PRK_TARGET_PATH/stencil-openmp           10 1000
-                #$PRK_TARGET_PATH/transpose-penmp          10 1024 32
-                #echo "Test stencil code generator"
-                #for s in star grid ; do
-                #    for r in 1 2 3 4 5 ; do
-                #        $PRK_TARGET_PATH/stencil-penmp 10 200 $s $r
-                #    done
-                #done
-                ;;
             ic*)
                 # Host
                 echo "OPENMPFLAG=-qopenmp" >> common/make.defs
-                ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp
+                ${MAKE} -C $PRK_TARGET_PATH nstream-openmp p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp
+                $PRK_TARGET_PATH/nstream-openmp           10 16777216 32
                 $PRK_TARGET_PATH/p2p-tasks-openmp         10 1024 1024 100 100
-                $PRK_TARGET_PATH/p2p-innerloop-openmp     10 1024 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024
+                $PRK_TARGET_PATH/p2p-hyperplane-openmp    10 1024 32
                 $PRK_TARGET_PATH/stencil-openmp           10 1000
                 $PRK_TARGET_PATH/transpose-openmp         10 1024 32
                 #echo "Test stencil code generator"
@@ -247,18 +237,19 @@ case "$PRK_TARGET" in
         esac
 
         # C11 with Cilk
-        if [ "${CC}" = "gcc" ] ; then
-            echo "CILKFLAG=-fcilkplus" >> common/make.defs
-            ${MAKE} -C $PRK_TARGET_PATH stencil-cilk transpose-cilk
-            $PRK_TARGET_PATH/stencil-cilk     10 1000
-            $PRK_TARGET_PATH/transpose-cilk   10 1024 32
-            #echo "Test stencil code generator"
-            for s in star grid ; do
-                for r in 1 2 3 4 5 ; do
-                    $PRK_TARGET_PATH/stencil-cilk 10 200 $s $r
-                done
-            done
-        fi
+        #if [ "${CC}" = "gcc" ] ; then
+        #    echo "CILKFLAG=-fcilkplus" >> common/make.defs
+        #    ${MAKE} -C $PRK_TARGET_PATH stencil-cilk transpose-cilk
+        #    $PRK_TARGET_PATH/stencil-cilk     10 1000
+        #    $PRK_TARGET_PATH/transpose-cilk   10 1024 32
+        #    #echo "Test stencil code generator"
+        #    for s in star grid ; do
+        #        for r in 1 2 3 4 5 ; do
+        #            $PRK_TARGET_PATH/stencil-cilk 10 200 $s $r
+        #        done
+        #    done
+        #fi
+
         # Use MUSL for GCC+Linux only
         if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$CC" = "gcc" ] ; then
             ${MAKE} -C $PRK_TARGET_PATH clean

From c1d8e90dc2717dfaf1a2dd6a45c60e1e9df715c3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 14 May 2019 10:43:42 -0400
Subject: [PATCH 210/245] add nstream-target

---
 .gitignore           |   1 +
 C1z/Makefile         |   2 +-
 C1z/nstream-target.c | 178 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 C1z/nstream-target.c

diff --git a/.gitignore b/.gitignore
index 2c32fc0c8..846e4a560 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,6 +102,7 @@ SERIAL/Synch_p2p/p2p
 SERIAL/Transpose/transpose
 C1z/nstream
 C1z/nstream-openmp
+C1z/nstream-target
 C1z/nstream-mpi
 C1z/nstream-memkind
 C1z/nstream-memkind-openmp
diff --git a/C1z/Makefile b/C1z/Makefile
index 23562f35f..892564b72 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -57,7 +57,7 @@ memkind: nstream-memkind nstream-memkind-openmp
 
 mmap: nstream-mmap nstream-mmap-openmp
 
-target: stencil-target transpose-target
+target: nstream-target stencil-target transpose-target
 
 taskloop: stencil-taskloop transpose-taskloop
 
diff --git a/C1z/nstream-target.c b/C1z/nstream-target.c
new file mode 100644
index 000000000..244528448
--- /dev/null
+++ b/C1z/nstream-target.c
@@ -0,0 +1,178 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///          Converted to C11 by Jeff Hammond, February 2019.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
+  printf("C11/OpenMP TARGET STREAM triad: A = B + scalar * C\n");
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    printf("Usage: <# iterations> <vector length>\n");
+    return 1;
+  }
+
+  // number of times to do the transpose
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    printf("ERROR: iterations must be >= 1\n");
+    return 1;
+  }
+
+  // length of a the matrix
+  size_t length = atol(argv[2]);
+  if (length <= 0) {
+    printf("ERROR: Matrix length must be greater than 0\n");
+    return 1;
+  }
+
+#ifdef _OPENMP
+  printf("Number of threads    = %d\n", omp_get_max_threads());
+#endif
+  printf("Number of iterations = %d\n", iterations);
+  printf("Vector length        = %zu\n", length);
+  //printf("Offset               = %d\n", offset);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time = 0.0;
+
+  size_t bytes = length*sizeof(double);
+  double * restrict A = prk_malloc(bytes);
+  double * restrict B = prk_malloc(bytes);
+  double * restrict C = prk_malloc(bytes);
+
+  double scalar = 3.0;
+
+  // HOST
+  OMP_PARALLEL()
+  {
+    OMP_FOR_SIMD()
+    for (size_t i=0; i<length; i++) {
+      A[i] = 0.0;
+      B[i] = 2.0;
+      C[i] = 2.0;
+    }
+  }
+
+  // DEVICE
+  OMP_TARGET( data map(tofrom: A[0:length]) map(to: B[0:length], C[0:length]) )
+  {
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk_wtime();
+
+      OMP_TARGET( teams distribute parallel for simd schedule(static,1) )
+      for (size_t i=0; i<length; i++) {
+          A[i] += B[i] + scalar * C[i];
+      }
+    }
+    nstream_time = prk_wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar = 0.0;
+  double br = 2.0;
+  double cr = 2.0;
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum = 0.0;
+  OMP_PARALLEL_FOR_REDUCE( +:asum )
+  for (size_t i=0; i<length; i++) {
+      asum += fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (fabs(ar-asum)/asum > epsilon) {
+      printf("Failed Validation on output array\n"
+             "       Expected checksum: %lf\n"
+             "       Observed checksum: %lf\n"
+             "ERROR: solution did not validate\n", ar, asum);
+      return 1;
+  } else {
+      printf("Solution validates\n");
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
+  }
+
+  return 0;
+}
+
+

From f1450ab2ebf36236652f29589a7e677fd5f4bfb1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 14 May 2019 11:37:59 -0400
Subject: [PATCH 211/245] add taskloop nstream

---
 C1z/Makefile           |   2 +-
 C1z/nstream-taskloop.c | 185 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 C1z/nstream-taskloop.c

diff --git a/C1z/Makefile b/C1z/Makefile
index 892564b72..9125fef1f 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -59,7 +59,7 @@ mmap: nstream-mmap nstream-mmap-openmp
 
 target: nstream-target stencil-target transpose-target
 
-taskloop: stencil-taskloop transpose-taskloop
+taskloop: nstream-taskloop stencil-taskloop transpose-taskloop
 
 cilk: stencil-cilk transpose-cilk
 
diff --git a/C1z/nstream-taskloop.c b/C1z/nstream-taskloop.c
new file mode 100644
index 000000000..69ae72639
--- /dev/null
+++ b/C1z/nstream-taskloop.c
@@ -0,0 +1,185 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///          Converted to C11 by Jeff Hammond, February 2019.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  printf("Parallel Research Kernels version %.2f\n", PRKVERSION );
+#ifdef _OPENMP
+  printf("C11/OpenMP TASKLOOP STREAM triad: A = B + scalar * C\n");
+#else
+  printf("C11/Serial STREAM triad: A = B + scalar * C\n");
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  if (argc < 3) {
+    printf("Usage: <# iterations> <vector length> [<taskloop grainsize>]\n");
+    return 1;
+  }
+
+  // number of times to do the transpose
+  int iterations = atoi(argv[1]);
+  if (iterations < 1) {
+    printf("ERROR: iterations must be >= 1\n");
+    return 1;
+  }
+
+  // length of a the matrix
+  size_t length = atol(argv[2]);
+  if (length <= 0) {
+    printf("ERROR: Matrix length must be greater than 0\n");
+    return 1;
+  }
+
+  // taskloop grainsize
+  int gs = (argc > 3) ? atoi(argv[3]) : 1024;
+
+
+#ifdef _OPENMP
+  printf("Number of threads    = %d\n", omp_get_max_threads());
+  printf("Taskloop grainsize    = %d\n", gs);
+#endif
+  printf("Number of iterations = %d\n", iterations);
+  printf("Vector length        = %zu\n", length);
+  //printf("Offset               = %d\n", offset);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time = 0.0;
+
+  size_t bytes = length*sizeof(double);
+  double * restrict A = prk_malloc(bytes);
+  double * restrict B = prk_malloc(bytes);
+  double * restrict C = prk_malloc(bytes);
+
+  double scalar = 3.0;
+
+  OMP_PARALLEL()
+  OMP_MASTER
+  {
+    OMP_TASKLOOP( firstprivate(length) shared(A,B,C) grainsize(gs) )
+    for (size_t i=0; i<length; i++) {
+      A[i] = 0.0;
+      B[i] = 2.0;
+      C[i] = 2.0;
+    }
+    OMP_TASKWAIT
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk_wtime();
+
+      OMP_TASKLOOP( firstprivate(length) shared(A,B,C) grainsize(gs) )
+      for (size_t i=0; i<length; i++) {
+          A[i] += B[i] + scalar * C[i];
+      }
+      OMP_TASKWAIT
+    }
+    nstream_time = prk_wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar = 0.0;
+  double br = 2.0;
+  double cr = 2.0;
+  for (int i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum = 0.0;
+  OMP_PARALLEL_FOR_REDUCE( +:asum )
+  for (size_t i=0; i<length; i++) {
+      asum += fabs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (fabs(ar-asum)/asum > epsilon) {
+      printf("Failed Validation on output array\n"
+             "       Expected checksum: %lf\n"
+             "       Observed checksum: %lf\n"
+             "ERROR: solution did not validate\n", ar, asum);
+      return 1;
+  } else {
+      printf("Solution validates\n");
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
+  }
+
+  return 0;
+}
+
+

From 1ff511c7d15cdb2a7f040dae7742cef1281f0b9a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 14 May 2019 11:39:12 -0400
Subject: [PATCH 212/245] ignore stuff [ci skip]

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 846e4a560..d4640217a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,6 +103,7 @@ SERIAL/Transpose/transpose
 C1z/nstream
 C1z/nstream-openmp
 C1z/nstream-target
+C1z/nstream-taskloop
 C1z/nstream-mpi
 C1z/nstream-memkind
 C1z/nstream-memkind-openmp

From 5394bed370b6b50e73062231f9d47ed9bfc31197 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 19 May 2019 21:53:02 -0700
Subject: [PATCH 213/245] use correct Kokkos exec space name (thanks Christian)

---
 Cxx11/stencil-kokkos.cc   | 2 +-
 Cxx11/transpose-kokkos.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc
index f5c3365ba..90ea21eaf 100644
--- a/Cxx11/stencil-kokkos.cc
+++ b/Cxx11/stencil-kokkos.cc
@@ -144,7 +144,7 @@ int main(int argc, char* argv[])
     std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
     std::cout << "Radius of stencil    = " << radius << std::endl;
     std::cout << "Compact representation of stencil loop body" << std::endl;
-    std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+    std::cout << "Kokkos execution space: " << Kokkos::DefaultExecutionSpace::name() << std::endl;
 
     auto stencil = nothing;
     if (star) {
diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc
index 9b5a4f6c0..2c5c8e2ca 100644
--- a/Cxx11/transpose-kokkos.cc
+++ b/Cxx11/transpose-kokkos.cc
@@ -111,7 +111,7 @@ int main(int argc, char * argv[])
     std::cout << "Matrix order         = " << order << std::endl;
     std::cout << "Tile size            = " << tile_size << std::endl;
     std::cout << "Permute loops        = " << (permute ? "yes" : "no") << std::endl;
-    std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+    std::cout << "Kokkos execution space: " << Kokkos::DefaultExecutionSpace::name() << std::endl;
 
     //////////////////////////////////////////////////////////////////////
     // Allocate space and perform the computation

From 8726a7543355fbbb9a0e187949fa87211bf59aed Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 8 Aug 2019 16:55:09 -0700
Subject: [PATCH 214/245] add Intel SYCL toolchain to LLVM example

---
 common/make.defs.llvm | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 092da96b8..180664d73 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -12,7 +12,7 @@ CC=${LLVM_PATH}clang -std=c11 -pthread
 # All of the Fortran code is written for the 2008 standard and requires preprocessing.
 FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Mfreeform -L/opt/llvm/pgi-flang/lib -Wl,-rpath=/opt/llvm/pgi-flang/lib
 # C++11 may not be required but does no harm here.
-CXX=${LLVM_PATH}clang++ -std=c++1z -pthread
+CXX=${LLVM_PATH}clang++ -std=c++17 -pthread
 #
 # Compiler flags
 #
@@ -57,11 +57,16 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib
 #
 # SYCL flags
 #
+# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md
+SYCLDIR=/opt/isycl
+SYCLCXX=${SYCLDIR}/bin/clang++
+SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib
+SYCLFLAG+=-std=c++17 -O3
 # CodePlay ComputeCpp
-SYCLDIR=/opt/sycl/latest
-SYCLCXX=${SYCLDIR}/bin/compute++
-SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp
-SYCLFLAG+=-std=c++14 -O3
+#SYCLDIR=/opt/sycl/latest
+#SYCLCXX=${SYCLDIR}/bin/compute++
+#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp
+#SYCLFLAG+=-std=c++14 -O3
 # This makes a huge difference in e.g. nstream...
 #SYCLFLAG+=-no-serial-memop
 # CentOS7 and Ubuntu14 built for this
@@ -90,6 +95,7 @@ SYCLFLAG+=-std=c++14 -O3
 #
 # TBB
 #
+#TBBDIR=/usr/lib/x86_64-linux-gnu
 TBBDIR=/usr/local/Cellar/tbb/2018_U3_1
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb

From 6d3e897d7621d12d0f0862c346c196793e0f5c99 Mon Sep 17 00:00:00 2001
From: Aksel Alpay <aksel.alpay@uni-heidelberg.de>
Date: Thu, 22 Aug 2019 17:24:20 +0000
Subject: [PATCH 215/245] Add initial hipSYCL support

---
 Cxx11/nstream-explicit-sycl.cc   | 10 ++++++++--
 Cxx11/nstream-sycl.cc            | 11 ++++++++---
 Cxx11/prk_util.h                 | 23 ++++++++++++++++++++++-
 Cxx11/stencil-sycl.cc            | 19 ++++++++++++++++---
 Cxx11/transpose-explicit-sycl.cc | 12 +++++++++---
 Cxx11/transpose-sycl.cc          | 10 ++++++++--
 6 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-explicit-sycl.cc
index e51b78a28..0213b95bc 100644
--- a/Cxx11/nstream-explicit-sycl.cc
+++ b/Cxx11/nstream-explicit-sycl.cc
@@ -245,6 +245,7 @@ int main(int argc, char * argv[])
 #endif
 
   try {
+#if SYCL_TRY_CPU_QUEUE
     if (length<100000) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
@@ -258,11 +259,13 @@ int main(int argc, char * argv[])
     } else {
         std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
+#endif
 
     // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = cpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -276,11 +279,13 @@ int main(int argc, char * argv[])
           run<double>(cpu, iterations, length);
         }
     }
+#endif
 
     // NVIDIA GPU requires ptx64 target and does not work very well
+#if SYCL_TRY_GPU_QUEUE
     if (1) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = gpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -310,6 +315,7 @@ int main(int argc, char * argv[])
 #endif
         }
     }
+#endif
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index f7d42d732..b823f220a 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -227,6 +227,7 @@ int main(int argc, char * argv[])
 #endif
 
   try {
+#if SYCL_TRY_CPU_QUEUE
     if (length<100000) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
@@ -240,11 +241,13 @@ int main(int argc, char * argv[])
     } else {
         std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
+#endif
 
     // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = cpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -258,11 +261,12 @@ int main(int argc, char * argv[])
           run<double>(cpu, iterations, length);
         }
     }
-
+#endif
     // NVIDIA GPU requires ptx64 target and does not work very well
+#if SYCL_TRY_GPU_QUEUE
     if (1) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = gpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -292,6 +296,7 @@ int main(int argc, char * argv[])
 #endif
         }
     }
+#endif
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index d09c16c3b..2a917ad68 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -82,13 +82,34 @@
 #define PRK_UNUSED
 #endif
 
+
 // for SYCL
-#ifdef TRISYCL
+
+// prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL
+#if defined(TRISYCL) || defined(__HIPSYCL__)
 #define PREBUILD_KERNEL 0
 #else
 #define PREBUILD_KERNEL 1
 #endif
 
+// not all SYCL implementations may support all device types.
+// If an implementation does not find any devices based on a
+// device selector, it will throw an exception.
+// These macros can be used to check if there's any chance
+// of an implementation targeting a CPU and GPU.
+#if !defined(__HIPSYCL__) || defined(HIPSYCL_PLATFORM_CPU)
+#define SYCL_TRY_CPU_QUEUE 1
+#else
+#define SYCL_TRY_CPU_QUEUE 0
+#endif
+
+#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU)
+#define SYCL_TRY_GPU_QUEUE 1
+#else
+#define SYCL_TRY_GPU_QUEUE 0
+#endif
+
+
 namespace prk {
 
     int get_alignment(void)
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index d9fa54ff6..53b643187 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -64,6 +64,7 @@
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
+
 #if 0
 #include "prk_opencl.h"
 #define USE_OPENCL 1
@@ -83,7 +84,13 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl
     std::cout << "You are trying to use a stencil that does not exist.\n";
     std::cout << "Please generate the new stencil using the code generator\n";
     std::cout << "and add it to the case-switch in the driver." << std::endl;
+    // There seems to be an issue with the clang CUDA/HIP toolchains not having
+    // std::abort() available
+#if defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC)
+    abort();
+#else
     std::abort();
+#endif
 }
 
 template <typename T>
@@ -322,9 +329,10 @@ int main(int argc, char * argv[])
 
   try {
 
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = host.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -334,11 +342,13 @@ int main(int argc, char * argv[])
         run<float>(host, iterations, n, tile_size, star, radius);
         run<double>(host, iterations, n, tile_size, star, radius);
     }
+#endif
 
     // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = cpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -352,11 +362,13 @@ int main(int argc, char * argv[])
           run<double>(cpu, iterations, n, tile_size, star, radius);
         }
     }
+#endif
 
     // NVIDIA GPU requires ptx64 target and does not work very well
+#if SYCL_TRY_GPU_QUEUE
     if (0) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = gpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -386,6 +398,7 @@ int main(int argc, char * argv[])
 #endif
         }
     }
+#endif
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc
index cedeafd68..2a5cfbf12 100644
--- a/Cxx11/transpose-explicit-sycl.cc
+++ b/Cxx11/transpose-explicit-sycl.cc
@@ -258,9 +258,10 @@ int main(int argc, char * argv[])
 #endif
 
   try {
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = host.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -269,11 +270,13 @@ int main(int argc, char * argv[])
         run<float>(host, iterations, order);
         run<double>(host, iterations, order);
     }
+#endif
 
     // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = cpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -287,11 +290,13 @@ int main(int argc, char * argv[])
           run<double>(cpu, iterations, order);
         }
     }
+#endif
 
     // NVIDIA GPU requires ptx64 target and does not work very well
+#if SYCL_TRY_GPU_QUEUE
     if (0) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = gpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -321,6 +326,7 @@ int main(int argc, char * argv[])
 #endif
         }
     }
+#endif
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 761fa136d..0323a08ac 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -224,6 +224,7 @@ int main(int argc, char * argv[])
 #endif
 
   try {
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue host(cl::sycl::host_selector{});
 #ifndef TRISYCL
@@ -235,11 +236,13 @@ int main(int argc, char * argv[])
         run<float>(host, iterations, order);
         run<double>(host, iterations, order);
     }
+#endif
 
     // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
     if (1) {
         cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = cpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -253,11 +256,13 @@ int main(int argc, char * argv[])
           run<double>(cpu, iterations, order);
         }
     }
+#endif
 
     // NVIDIA GPU requires ptx64 target and does not work very well
+#if SYCL_TRY_GPU_QUEUE
     if (0) {
         cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#ifndef TRISYCL
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
         auto device      = gpu.get_device();
         auto platform    = device.get_platform();
         std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
@@ -287,6 +292,7 @@ int main(int argc, char * argv[])
 #endif
         }
     }
+#endif
   }
   catch (cl::sycl::exception e) {
     std::cout << e.what() << std::endl;

From a9353475c919074bf4fa67113fecd0de607eebd4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 29 Aug 2019 18:00:29 -0700
Subject: [PATCH 216/245] fix copy-and-paste error

---
 Cxx11/prk_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 2a917ad68..c0d5d321f 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -103,7 +103,7 @@
 #define SYCL_TRY_CPU_QUEUE 0
 #endif
 
-#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU)
+#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_GPU)
 #define SYCL_TRY_GPU_QUEUE 1
 #else
 #define SYCL_TRY_GPU_QUEUE 0

From 6974328dc685fba5d13ed5e72db64bebe0d3702d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 20 Aug 2019 12:35:49 -0700
Subject: [PATCH 217/245] catch exceptions by reference in SYCL codes

---
 Cxx11/nstream-explicit-sycl.cc   | 8 ++++----
 Cxx11/nstream-sycl.cc            | 8 ++++----
 Cxx11/stencil-sycl.cc            | 8 ++++----
 Cxx11/transpose-explicit-sycl.cc | 8 ++++----
 Cxx11/transpose-sycl.cc          | 8 ++++----
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-explicit-sycl.cc
index 0213b95bc..5201a48b1 100644
--- a/Cxx11/nstream-explicit-sycl.cc
+++ b/Cxx11/nstream-explicit-sycl.cc
@@ -142,7 +142,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
     });
     q.wait();
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -153,7 +153,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 #endif
     return;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return;
   }
@@ -317,7 +317,7 @@ int main(int argc, char * argv[])
     }
 #endif
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -328,7 +328,7 @@ int main(int argc, char * argv[])
 #endif
     return 1;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return 1;
   }
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index b823f220a..2657f7200 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -124,7 +124,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
     // for other device-oriented programming models.
     nstream_time = prk::wtime() - nstream_time;
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -135,7 +135,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 #endif
     return;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return;
   }
@@ -298,7 +298,7 @@ int main(int argc, char * argv[])
     }
 #endif
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -309,7 +309,7 @@ int main(int argc, char * argv[])
 #endif
     return 1;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return 1;
   }
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index 53b643187..b333c4194 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -195,7 +195,7 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
     }
     stencil_time = prk::wtime() - stencil_time;
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -206,7 +206,7 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
 #endif
     return;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return;
   }
@@ -400,7 +400,7 @@ int main(int argc, char * argv[])
     }
 #endif
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -411,7 +411,7 @@ int main(int argc, char * argv[])
 #endif
     return 1;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return 1;
   }
diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc
index 2a5cfbf12..e92dfaa1f 100644
--- a/Cxx11/transpose-explicit-sycl.cc
+++ b/Cxx11/transpose-explicit-sycl.cc
@@ -157,7 +157,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
     });
     q.wait();
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -168,7 +168,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
 #endif
     return;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return;
   }
@@ -328,7 +328,7 @@ int main(int argc, char * argv[])
     }
 #endif
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -339,7 +339,7 @@ int main(int argc, char * argv[])
 #endif
     return 1;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return 1;
   }
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index 0323a08ac..b22b162be 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -123,7 +123,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
     // for other device-oriented programming models.
     trans_time = prk::wtime() - trans_time;
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -134,7 +134,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
 #endif
     return;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return;
   }
@@ -294,7 +294,7 @@ int main(int argc, char * argv[])
     }
 #endif
   }
-  catch (cl::sycl::exception e) {
+  catch (cl::sycl::exception & e) {
     std::cout << e.what() << std::endl;
 #ifdef __COMPUTECPP__
     std::cout << e.get_file_name() << std::endl;
@@ -305,7 +305,7 @@ int main(int argc, char * argv[])
 #endif
     return 1;
   }
-  catch (std::exception e) {
+  catch (std::exception & e) {
     std::cout << e.what() << std::endl;
     return 1;
   }

From 3a7e165bfab8064d73328da13bfd9c48d677cfc9 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 29 Aug 2019 18:21:41 -0700
Subject: [PATCH 218/245] revert incorrect fix and add comment so future Jeff
 understands

---
 Cxx11/prk_util.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index c0d5d321f..7feaec93d 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -103,7 +103,8 @@
 #define SYCL_TRY_CPU_QUEUE 0
 #endif
 
-#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_GPU)
+// !defined(HIPSYCL_PLATFORM_CPU) = !( defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) )
+#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU)
 #define SYCL_TRY_GPU_QUEUE 1
 #else
 #define SYCL_TRY_GPU_QUEUE 0

From 443540af80438f3de4b2786f08e7673d7e83af6c Mon Sep 17 00:00:00 2001
From: Toby Isaac <tisaac@cc.gatech.edu>
Date: Tue, 10 Sep 2019 08:28:42 -0400
Subject: [PATCH 219/245] clean OpenMP target stencil in C1z

Same as previous by jeffhammond in Cxx11/:

> - GPU-style target means the functions are invoked on host, so must
>   remove "declare target" for correctness (caught by LLVM 5)
---
 C1z/generate-c-stencil.py | 8 ++++----
 C1z/stencil_target.h      | 2 --
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/C1z/generate-c-stencil.py b/C1z/generate-c-stencil.py
index b16dc8dcb..fb1a57f48 100755
--- a/C1z/generate-c-stencil.py
+++ b/C1z/generate-c-stencil.py
@@ -76,13 +76,13 @@ def instance(src,model,pattern,r):
 def main():
     for model in ['seq','openmp','target','cilk','taskloop']:
       src = open('stencil_'+model+'.h','w')
-      if (model=='target'):
-          src.write('OMP( declare target )\n')
+      #if (model=='target'):
+      #    src.write('OMP( declare target )\n')
       for pattern in ['star','grid']:
         for r in range(1,10):
           instance(src,model,pattern,r)
-      if (model=='target'):
-          src.write('OMP( end declare target )\n')
+      #if (model=='target'):
+      #    src.write('OMP( end declare target )\n')
       src.close()
 
 if __name__ == '__main__':
diff --git a/C1z/stencil_target.h b/C1z/stencil_target.h
index 4f7edfd36..ae64a29f6 100644
--- a/C1z/stencil_target.h
+++ b/C1z/stencil_target.h
@@ -1,4 +1,3 @@
-OMP( declare target )
 void star1(const int n, const double * restrict in, double * restrict out) {
     OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
     for (int i=1; i<n-1; i++) {
@@ -1562,4 +1561,3 @@ void grid9(const int n, const double * restrict in, double * restrict out) {
      }
 }
 
-OMP( end declare target )

From a4299f05a80b6f6e5cb34377a509dd1e1d9d2040 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 21 Sep 2019 20:05:15 -0700
Subject: [PATCH 220/245] do not map timers

---
 Cxx11/nstream-openmp-target.cc      |  2 +-
 Cxx11/transpose-openmp-target.cc    | 24 ++++++++++++------------
 FORTRAN/dgemm-openmp-target.f90     |  3 +--
 FORTRAN/nstream-openmp-target.f90   |  3 +--
 FORTRAN/stencil-openmp-target.f90   |  3 +--
 FORTRAN/transpose-openmp-target.f90 |  2 +-
 6 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/Cxx11/nstream-openmp-target.cc b/Cxx11/nstream-openmp-target.cc
index 06af1c204..d4a437a08 100644
--- a/Cxx11/nstream-openmp-target.cc
+++ b/Cxx11/nstream-openmp-target.cc
@@ -129,7 +129,7 @@ int main(int argc, char * argv[])
   }
 
   // DEVICE
-  OMP_TARGET( data map(tofrom: A[0:length], B[0:length], C[0:length]) map(from:nstream_time) )
+  OMP_TARGET( data map(tofrom: A[0:length], B[0:length], C[0:length]) )
   {
     for (auto iter = 0; iter<=iterations; iter++) {
 
diff --git a/Cxx11/transpose-openmp-target.cc b/Cxx11/transpose-openmp-target.cc
index a611997f5..8702ec45b 100644
--- a/Cxx11/transpose-openmp-target.cc
+++ b/Cxx11/transpose-openmp-target.cc
@@ -111,9 +111,9 @@ int main(int argc, char * argv[])
   OMP_PARALLEL()
   {
     OMP_FOR()
-    for (auto i=0;i<order; i++) {
+    for (int i=0;i<order; i++) {
       PRAGMA_SIMD
-      for (auto j=0;j<order;j++) {
+      for (int j=0;j<order;j++) {
         A[i*order+j] = static_cast<double>(i*order+j);
         B[i*order+j] = 0.0;
       }
@@ -121,19 +121,19 @@ int main(int argc, char * argv[])
   }
 
   // DEVICE
-  OMP_TARGET( data map(tofrom: A[0:order*order], B[0:order*order]) map(from:trans_time) )
+  OMP_TARGET( data map(tofrom: A[0:order*order], B[0:order*order]) )
   {
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) trans_time = omp_get_wtime();
 
       // transpose the  matrix
       if (tile_size < order) {
         OMP_TARGET( teams distribute parallel for simd collapse(2) )
-        for (auto it=0; it<order; it+=tile_size) {
-          for (auto jt=0; jt<order; jt+=tile_size) {
-            for (auto i=it; i<std::min(order,it+tile_size); i++) {
-              for (auto j=jt; j<std::min(order,jt+tile_size); j++) {
+        for (int it=0; it<order; it+=tile_size) {
+          for (int jt=0; jt<order; jt+=tile_size) {
+            for (int i=it; i<std::min(order,it+tile_size); i++) {
+              for (int j=jt; j<std::min(order,jt+tile_size); j++) {
                 B[i*order+j] += A[j*order+i];
                 A[j*order+i] += 1.0;
               }
@@ -142,8 +142,8 @@ int main(int argc, char * argv[])
         }
       } else {
         OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )
-        for (auto i=0;i<order; i++) {
-          for (auto j=0;j<order;j++) {
+        for (int i=0;i<order; i++) {
+          for (int j=0;j<order;j++) {
             B[i*order+j] += A[j*order+i];
             A[j*order+i] += 1.0;
           }
@@ -160,8 +160,8 @@ int main(int argc, char * argv[])
   const auto addit = (iterations+1.) * (iterations/2.);
   auto abserr = 0.0;
   OMP_PARALLEL_FOR_REDUCE( +:abserr )
-  for (auto j=0; j<order; j++) {
-    for (auto i=0; i<order; i++) {
+  for (int j=0; j<order; j++) {
+    for (int i=0; i<order; i++) {
       const int ij = i*order+j;
       const int ji = j*order+i;
       const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.f90
index 3c8ffbeec..d1af37ba5 100644
--- a/FORTRAN/dgemm-openmp-target.f90
+++ b/FORTRAN/dgemm-openmp-target.f90
@@ -149,8 +149,7 @@ program main
   enddo
   !$omp end parallel do
 
-  !$omp target data map(to: A,B) map(tofrom: C) map(from:dgemm_time) &
-  !$omp& map(to:iterations,order)
+  !$omp target data map(to: A,B) map(tofrom: C) map(to:order)
 
   t0 = 0
 
diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.f90
index 954a86b1e..13e77f628 100644
--- a/FORTRAN/nstream-openmp-target.f90
+++ b/FORTRAN/nstream-openmp-target.f90
@@ -163,8 +163,7 @@ program main
   enddo
   !$omp end parallel do simd
 
-  !$omp target data map(tofrom: A) map(to: B,C) map(from:nstream_time) &
-  !$omp& map(to:iterations,length)
+  !$omp target data map(tofrom: A) map(to: B,C) map(to:length)
 
   do k=0,iterations
 
diff --git a/FORTRAN/stencil-openmp-target.f90 b/FORTRAN/stencil-openmp-target.f90
index f7724ada9..7bceb70e1 100644
--- a/FORTRAN/stencil-openmp-target.f90
+++ b/FORTRAN/stencil-openmp-target.f90
@@ -320,8 +320,7 @@ program main
 #endif
   !$omp end parallel
 
-  !$omp target data map(to:W, A) map(tofrom: B) map(from:stencil_time) &
-  !$omp& map(to:iterations,n)
+  !$omp target data map(to:W, A) map(tofrom: B) map(to:n)
 
   t0 = 0
 
diff --git a/FORTRAN/transpose-openmp-target.f90 b/FORTRAN/transpose-openmp-target.f90
index 1da28346a..a2c08e1a8 100644
--- a/FORTRAN/transpose-openmp-target.f90
+++ b/FORTRAN/transpose-openmp-target.f90
@@ -143,7 +143,7 @@ program main
   enddo
   !$omp end parallel do simd
 
-  !$omp target data map(to: A) map(tofrom: B) map(from:trans_time)
+  !$omp target data map(to: A) map(tofrom: B)
 
   t0 = 0
 

From a106364e221ac93ae284c18ea7b9ac8f9cb3059e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 30 Sep 2019 14:30:53 -0700
Subject: [PATCH 221/245] eliminate std::math in target region because not
 declare target (#415)

also remove half-baked p2p
---
 Cxx11/p2p-openmp-target.cc       | 185 -------------------------------
 Cxx11/prk_openmp.h               |  11 ++
 Cxx11/transpose-openmp-target.cc |   4 +-
 3 files changed, 13 insertions(+), 187 deletions(-)
 delete mode 100644 Cxx11/p2p-openmp-target.cc

diff --git a/Cxx11/p2p-openmp-target.cc b/Cxx11/p2p-openmp-target.cc
deleted file mode 100644
index a9220285f..000000000
--- a/Cxx11/p2p-openmp-target.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-///
-/// Copyright (c) 2013, Intel Corporation
-///
-/// Redistribution and use in source and binary forms, with or without
-/// modification, are permitted provided that the following conditions
-/// are met:
-///
-/// * Redistributions of source code must retain the above copyright
-///       notice, this list of conditions and the following disclaimer.
-/// * Redistributions in binary form must reproduce the above
-///       copyright notice, this list of conditions and the following
-///       disclaimer in the documentation and/or other materials provided
-///       with the distribution.
-/// * Neither the name of Intel Corporation nor the names of its
-///       contributors may be used to endorse or promote products
-///       derived from this software without specific prior written
-///       permission.
-///
-/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/// POSSIBILITY OF SUCH DAMAGE.
-
-//////////////////////////////////////////////////////////////////////
-///
-/// NAME:    Pipeline
-///
-/// PURPOSE: This program tests the efficiency with which point-to-point
-///          synchronization can be carried out. It does so by executing
-///          a pipelined algorithm on an m*n grid. The first array dimension
-///          is distributed among the threads (stripwise decomposition).
-///
-/// USAGE:   The program takes as input the
-///          dimensions of the grid, and the number of iterations on the grid
-///
-///                <progname> <iterations> <m> <n>
-///
-///          The output consists of diagnostics to make sure the
-///          algorithm worked, and of timing statistics.
-///
-/// FUNCTIONS CALLED:
-///
-///          Other than standard C functions, the following
-///          functions are used in this program:
-///
-///          wtime()
-///
-/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
-///            C99-ification by Jeff Hammond, February 2016.
-///            C++11-ification by Jeff Hammond, May 2017.
-///
-//////////////////////////////////////////////////////////////////////
-
-#include "prk_util.h"
-
-int main(int argc, char* argv[])
-{
-  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
-  std::cout << "C++11/OpenMP TARGET DOACROSS pipeline execution on 2D grid" << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  // Process and test input parameters
-  //////////////////////////////////////////////////////////////////////
-
-  int iterations;
-  int m, n;
-  try {
-      if (argc < 4){
-        throw " <# iterations> <first array dimension> <second array dimension>";
-      }
-
-      // number of times to run the pipeline algorithm
-      iterations  = std::atoi(argv[1]);
-      if (iterations < 1) {
-        throw "ERROR: iterations must be >= 1";
-      }
-
-      // grid dimensions
-      m = std::atoi(argv[2]);
-      n = std::atoi(argv[3]);
-      if (m < 1 || n < 1) {
-        throw "ERROR: grid dimensions must be positive";
-      } else if ( static_cast<size_t>(m)*static_cast<size_t>(n) > INT_MAX) {
-        throw "ERROR: grid dimension too large - overflow risk";
-      }
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-    return 1;
-  }
-
-  std::cout << "Number of threads (max)   = " << omp_get_max_threads() << std::endl;
-  std::cout << "Number of iterations = " << iterations << std::endl;
-  std::cout << "Grid sizes           = " << m << ", " << n << std::endl;
-
-  //////////////////////////////////////////////////////////////////////
-  // Allocate space and perform the computation
-  //////////////////////////////////////////////////////////////////////
-
-  auto pipeline_time = 0.0; // silence compiler warning
-
-  // working set
-  double * grid = new double[m*n];
-
-  OMP_PARALLEL()
-  {
-    OMP_FOR()
-    for (auto i=0; i<n; i++) {
-      for (auto j=0; j<n; j++) {
-        grid[i*n+j] = 0.0;
-      }
-    }
-
-    // set boundary values (bottom and left side of grid)
-    OMP_MASTER
-    {
-      for (auto j=0; j<n; j++) {
-        grid[0*n+j] = static_cast<double>(j);
-      }
-      for (auto i=0; i<m; i++) {
-        grid[i*n+0] = static_cast<double>(i);
-      }
-    }
-    OMP_BARRIER
-  }
-
-  OMP_TARGET( data map(tofrom:grid[0:m*n]) map(from:pipeline_time) )
-  {
-    for (auto iter = 0; iter<=iterations; iter++) {
-
-      if (iter==1) pipeline_time = omp_get_wtime();
-
-      OMP_PARALLEL() {
-        OMP_FOR( collapse(2) ordered(2) )
-        for (auto i=1; i<m; i++) {
-          for (auto j=1; j<n; j++) {
-            OMP_ORDERED( depend(sink: i-1,j) depend(sink: i,j-1) depend(sink: i-1,j-1) )
-            grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
-            OMP_ORDERED( depend (source) )
-          }
-        }
-
-        OMP_MASTER
-        grid[0*n+0] = -grid[(m-1)*n+(n-1)];
-      }
-    }
-
-    pipeline_time = omp_get_wtime() - pipeline_time;
-  }
-
-  //////////////////////////////////////////////////////////////////////
-  // Analyze and output results.
-  //////////////////////////////////////////////////////////////////////
-
-  // error tolerance
-  const double epsilon = 1.e-8;
-
-  // verify correctness, using top right value
-  auto corner_val = ((iterations+1.)*(n+m-2.));
-  if ( (std::fabs(grid[(m-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) {
-    std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)]
-              << " does not match verification value " << corner_val << std::endl;
-    return 1;
-  }
-
-#ifdef VERBOSE
-  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
-#else
-  std::cout << "Solution validates" << std::endl;
-#endif
-  auto avgtime = pipeline_time/iterations;
-  std::cout << "Rate (MFlops/s): "
-            << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime
-            << " Avg time (s): " << avgtime << std::endl;
-
-  return 0;
-}
diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h
index 4d6396b9b..578e713e5 100644
--- a/Cxx11/prk_openmp.h
+++ b/Cxx11/prk_openmp.h
@@ -91,4 +91,15 @@
 # define OMP_END_DECLARE_TARGET
 #endif
 
+// used in OpenMP target code because std::min etc are not declare target
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#endif
+
 #endif /* PRK_OPENMP_H */
diff --git a/Cxx11/transpose-openmp-target.cc b/Cxx11/transpose-openmp-target.cc
index 8702ec45b..b106e6f48 100644
--- a/Cxx11/transpose-openmp-target.cc
+++ b/Cxx11/transpose-openmp-target.cc
@@ -132,8 +132,8 @@ int main(int argc, char * argv[])
         OMP_TARGET( teams distribute parallel for simd collapse(2) )
         for (int it=0; it<order; it+=tile_size) {
           for (int jt=0; jt<order; jt+=tile_size) {
-            for (int i=it; i<std::min(order,it+tile_size); i++) {
-              for (int j=jt; j<std::min(order,jt+tile_size); j++) {
+            for (int i=it; i<MIN(order,it+tile_size); i++) {
+              for (int j=jt; j<MIN(order,jt+tile_size); j++) {
                 B[i*order+j] += A[j*order+i];
                 A[j*order+i] += 1.0;
               }

From 0675884b9faf48850f93b9bee157a5c46caaa520 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 4 Aug 2017 14:44:32 -0700
Subject: [PATCH 222/245] add ignores

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index d4640217a..0b5905f43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,6 +118,8 @@ C1z/p2p-hyperplane
 C1z/p2p-hyperplane-openmp
 C1z/p2p-tasks-openmp
 C1z/p2p-simd-openmp
+C1z/p2p-avx
+C1z/p2p-sse
 C1z/stencil
 C1z/stencil-cilk
 C1z/stencil-openmp

From f0c24d7f16a0c3416146f60a8ccea6142617ebcb Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 30 Sep 2019 21:03:31 -0700
Subject: [PATCH 223/245] add example of all the crap required when using
 https://github.com/boostorg/boost

---
 common/make.defs.boost | 186 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 common/make.defs.boost

diff --git a/common/make.defs.boost b/common/make.defs.boost
new file mode 100644
index 000000000..d9065433a
--- /dev/null
+++ b/common/make.defs.boost
@@ -0,0 +1,186 @@
+#
+# This file shows the GCC toolchain options for PRKs using
+# OpenMP, MPI and/or Fortran coarrays only.
+#
+# Base compilers and language options
+#
+VERSION=-9
+# C99 is required in some implementations.
+CC=gcc${VERSION} -std=c11 -pthread
+#EXTRA_CLIBS=-lrt
+# All of the Fortran code is written for the 2008 standard and requires preprocessing.
+FC=gfortran${VERSION} -std=f2008 -cpp
+# C++11 may not be required but does no harm here.
+#CXX=g++${VERSION} -std=gnu++17 -pthread
+CXX=clang++ -std=gnu++17 -pthread
+#
+# Compiler flags
+#
+# -mtune=native is appropriate for most cases.
+# -march=native is appropriate if you want portable binaries.
+DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math
+#DEFAULT_OPT_FLAGS=-O0
+DEFAULT_OPT_FLAGS+=-g3
+#DEFAULT_OPT_FLAGS+=-fsanitize=undefined
+#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak
+#DEFAULT_OPT_FLAGS+=-fsanitize=address
+#DEFAULT_OPT_FLAGS+=-fsanitize=thread
+# If you are compiling for KNL on a Xeon login node, use the following:
+# DEFAULT_OPT_FLAGS=-g -O3 -march=knl
+# See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details.
+#
+#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
+DEFAULT_OPT_FLAGS+=-Wall #-Werror
+DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations
+#DEFAULT_OPT_FLAGS+=-mavx -mfma
+#
+# OpenMP flags
+#
+OPENMPFLAG=-fopenmp
+OPENMPSIMDFLAG=-fopenmp-simd
+OFFLOADFLAG=-foffload="-O3 -v"
+ORNLACCFLAG=-fopenacc
+#
+# OpenCL flags
+#
+# MacOS
+#OPENCLFLAG=-framework OpenCL
+# POCL
+# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct...
+OPENCLFLAG=-I/opt/pocl/include -I/opt/pocl/share/pocl/include -L/opt/pocl/lib -Wl,-rpath -Wl,/opt/pocl/lib -lpocl
+# Linux
+#OPENCLDIR=/etc/alternatives/opencl-intel-tools
+#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations
+METALFLAG=-framework MetalPerformanceShaders
+#
+# OCCA
+#
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
+# Cilk
+#
+#CILKFLAG=-fcilkplus
+#
+# TBB
+#
+TBBDIR=/usr/local/Cellar/tbb/2019_U5_1
+TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#
+# Parallel STL, Boost, etc.
+#
+#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include
+BOOSTROOT=/Users/jrhammon/Work/Languages/boost/libs
+BOOSTFLAG=
+BOOSTFLAG+=-I${BOOSTROOT}/circular_buffer/include
+BOOSTFLAG+=-I${BOOSTROOT}/compute/include
+BOOSTFLAG+=-I${BOOSTROOT}/algorithm/include
+BOOSTFLAG+=-I${BOOSTROOT}/config/include
+BOOSTFLAG+=-I${BOOSTROOT}/core/include
+BOOSTFLAG+=-I${BOOSTROOT}/log/include
+BOOSTFLAG+=-I${BOOSTROOT}/array/include
+BOOSTFLAG+=-I${BOOSTROOT}/multi_array/include
+BOOSTFLAG+=-I${BOOSTROOT}/optional/include
+BOOSTFLAG+=-I${BOOSTROOT}/preprocessor/include
+BOOSTFLAG+=-I${BOOSTROOT}/type_index/include
+BOOSTFLAG+=-I${BOOSTROOT}/utility/include
+BOOSTFLAG+=-I${BOOSTROOT}/assert/include
+BOOSTFLAG+=-I${BOOSTROOT}/static_assert/include
+BOOSTFLAG+=-I${BOOSTROOT}/exception/include
+BOOSTFLAG+=-I${BOOSTROOT}/throw_exception/include
+BOOSTFLAG+=-I${BOOSTROOT}/concept_check/include
+BOOSTFLAG+=-I${BOOSTROOT}/type_traits/include
+BOOSTFLAG+=-I${BOOSTROOT}/iterator/include
+BOOSTFLAG+=-I${BOOSTROOT}/mpl/include
+BOOSTFLAG+=-I${BOOSTROOT}/detail/include
+BOOSTFLAG+=-I${BOOSTROOT}/functional/include
+BOOSTFLAG+=-I${BOOSTROOT}/move/include
+BOOSTFLAG+=-I${BOOSTROOT}/range/include
+BOOSTFLAG+=-I${BOOSTROOT}/function/include
+BOOSTFLAG+=-I${BOOSTROOT}/integer/include
+BOOSTFLAG+=-I${BOOSTROOT}/container_hash/include
+BOOSTFLAG+=-I${BOOSTROOT}/bind/include
+BOOSTFLAG+=-I${BOOSTROOT}/chrono/include
+BOOSTFLAG+=-I${BOOSTROOT}/predef/include
+BOOSTFLAG+=-I${BOOSTROOT}/ratio/include
+BOOSTFLAG+=-I${BOOSTROOT}/function_types/include
+BOOSTFLAG+=-I${BOOSTROOT}/tuple/include
+BOOSTFLAG+=-I${BOOSTROOT}/lexical_cast/include
+BOOSTFLAG+=-I${BOOSTROOT}/numeric/conversion/include
+BOOSTFLAG+=-I${BOOSTROOT}/container/include
+BOOSTFLAG+=-I${BOOSTROOT}/math/include
+BOOSTFLAG+=-I${BOOSTROOT}/fusion/include
+BOOSTFLAG+=-I${BOOSTROOT}/typeof/include
+BOOSTFLAG+=-I${BOOSTROOT}/uuid/include
+BOOSTFLAG+=-I${BOOSTROOT}/smart_ptr/include
+BOOSTFLAG+=-I${BOOSTROOT}/proto/include
+BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+##RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}
+KOKKOSDIR=/opt/kokkos/gcc
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
+RAJADIR=/opt/raja/gcc
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
+#
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+SYCLCXX=${CXX} -O3 -Wall -std=c++17 ${OPENMPFLAG}
+SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
+# ProGTX
+# https://github.com/ProGTX/sycl-gtx
+#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
+#SYCLCXX=${CXX} ${OPENMPFLAG}
+#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
+SYCLFLAG+=${RANGEFLAG}
+#
+# CBLAS for C++ DGEMM
+#
+BLASFLAG=-DACCELERATE -framework Accelerate
+CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
+#
+# CUDA flags
+#
+# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
+NVCC=/opt/llvm/cocl/bin/cocl
+# Linux w/ NVIDIA CUDA
+#NVCC=nvcc
+#CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50
+# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
+#CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
+#
+# Halide
+#
+HALIDECXX=c++
+HALIDEDIR=/opt/halide
+HALIDEFLAG=-I${HALIDEDIR}/include
+HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide
+#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0
+HALIDEFLAG+=${DEFAULT_OPT_FLAGS}
+HALIDEFLAG+=-std=c++17 -g3
+#
+# ISPC
+#
+ISPC=ispc
+ISPCFLAG=-O3 --target=host --opt=fast-math
+#
+# MPI
+#
+# We assume you have installed an implementation of MPI-3 that is in your path.
+MPICC=mpicc -std=c99
+#
+# Fortran 2008 coarrays
+#
+# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details
+# single-node
+COARRAYFLAG=-fcoarray=single -lcaf_single
+# multi-node
+# COARRAYFLAG=-fcoarray=lib -lcaf_mpi
+
+MEMKINDDIR=/home/parallels/PRK/deps
+MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib

From 2163dc8c2a853e12ed38dbe9051e5f98fcaa395b Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 1 Oct 2019 05:29:29 +0000
Subject: [PATCH 224/245] fix loop index in lambda, now required by USM

---
 Cxx11/nstream-sycl-usm.cc | 349 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 349 insertions(+)
 create mode 100644 Cxx11/nstream-sycl-usm.cc

diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc
new file mode 100644
index 000000000..1aed0931c
--- /dev/null
+++ b/Cxx11/nstream-sycl-usm.cc
@@ -0,0 +1,349 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    nstream
+///
+/// PURPOSE: To compute memory bandwidth when adding a vector of a given
+///          number of double precision values to the scalar multiple of
+///          another vector of the same length, and storing the result in
+///          a third vector.
+///
+/// USAGE:   The program takes as input the number
+///          of iterations to loop over the triad vectors, the length of the
+///          vectors, and the offset between vectors
+///
+///          <progname> <# iterations> <vector length> <offset>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// NOTES:   Bandwidth is determined as the number of words read, plus the
+///          number of words written, times the size of the words, divided
+///          by the execution time. For a vector length of N, the total
+///          number of words read and written is 4*N*sizeof(double).
+///
+///
+/// HISTORY: This code is loosely based on the Stream benchmark by John
+///          McCalpin, but does not follow all the Stream rules. Hence,
+///          reported results should not be associated with Stream in
+///          external publications
+///
+///          Converted to C++11 by Jeff Hammond, November 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "CL/sycl.hpp"
+#include "prk_util.h"
+
+namespace sycl = cl::sycl;
+
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
+template <typename T> class nstream;
+
+template <typename T>
+void run(sycl::queue & q, int iterations, size_t length)
+{
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double nstream_time(0);
+
+  const T scalar(3);
+
+  //std::vector<T> h_A(length,0);
+  //std::vector<T> h_B(length,2);
+  //std::vector<T> h_C(length,2);
+
+  T * A;
+  T * B;
+  T * C;
+
+  try {
+
+    auto ctx = q.get_context();
+    auto dev = q.get_device();
+
+#if PREBUILD_KERNEL
+    sycl::program kernel(ctx);
+    kernel.build_with_kernel_type<nstream<T>>();
+#endif
+
+    //sycl::buffer<T,1> d_A { h_A.data(), sycl::range<1>(h_A.size()) };
+    //sycl::buffer<T,1> d_B { h_B.data(), sycl::range<1>(h_B.size()) };
+    //sycl::buffer<T,1> d_C { h_C.data(), sycl::range<1>(h_C.size()) };
+
+    A = static_cast<T*>(sycl::malloc_shared(length * sizeof(T), dev, ctx));
+    B = static_cast<T*>(sycl::malloc_shared(length * sizeof(T), dev, ctx));
+    C = static_cast<T*>(sycl::malloc_shared(length * sizeof(T), dev, ctx));
+
+    for (size_t i=0; i<length; i++) {
+      A[i] = 0.0;
+      B[i] = 2.0;
+      C[i] = 2.0;
+    }
+
+    for (int iter = 0; iter<=iterations; ++iter) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      q.submit([&](sycl::handler& h) {
+
+        //auto A = d_A.template get_access<sycl::access::mode::read_write>(h);
+        //auto B = d_B.template get_access<sycl::access::mode::read>(h);
+        //auto C = d_C.template get_access<sycl::access::mode::read>(h);
+
+        h.parallel_for<class nstream<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<nstream<T>>(),
+#endif
+                sycl::range<1>{length}, [=] (sycl::id<1> it) {
+            const size_t i = it[0];
+            A[i] += B[i] + scalar * C[i];
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    nstream_time = prk::wtime() - nstream_time;
+
+    sycl::free(A, ctx);
+    sycl::free(B, ctx);
+    sycl::free(C, ctx);
+
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
+    return;
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  T ar(0);
+  T br(2);
+  T cr(2);
+  for (int i=0; i<=iterations; ++i) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; ++i) {
+      asum += std::fabs(A[i]);
+  }
+
+  const double epsilon(1.e-8);
+  if (std::fabs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(T);
+      std::cout << 8*sizeof(T) << "B "
+                << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (length <= 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
+  try {
+#if SYCL_TRY_CPU_QUEUE
+    if (length<100000) {
+        sycl::queue host(sycl::host_selector{});
+#ifndef TRISYCL
+        auto device      = host.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
+#endif
+        run<float>(host, iterations, length);
+        run<double>(host, iterations, length);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
+    }
+#endif
+
+    // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
+    if (1) {
+        sycl::queue cpu(sycl::cpu_selector{});
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
+        auto device      = cpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir"));
+#else
+        bool has_spir = true; // ?
+#endif
+        if (has_spir) {
+          run<float>(cpu, iterations, length);
+          run<double>(cpu, iterations, length);
+        }
+    }
+#endif
+    // NVIDIA GPU requires ptx64 target and does not work very well
+#if SYCL_TRY_GPU_QUEUE
+    if (1) {
+        sycl::queue gpu(sycl::gpu_selector{});
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
+        auto device      = gpu.get_device();
+        auto platform    = device.get_platform();
+        std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
+        std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
+        bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir"));
+        bool has_fp64 = device.has_extension(sycl::string_class("cl_khr_fp64"));
+#else
+        bool has_spir = true; // ?
+        bool has_fp64 = true;
+#endif
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir) {
+          run<float>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
+        } else {
+          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
+#ifdef __COMPUTECPP__
+          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
+          run<float>(gpu, iterations, length);
+          if (has_fp64) {
+            run<double>(gpu, iterations, length);
+          }
+#endif
+        }
+    }
+#endif
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+#ifdef __COMPUTECPP__
+    std::cout << e.get_file_name() << std::endl;
+    std::cout << e.get_line_number() << std::endl;
+    std::cout << e.get_description() << std::endl;
+    std::cout << e.get_cl_error_message() << std::endl;
+    std::cout << e.get_cl_code() << std::endl;
+#endif
+    return 1;
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+

From 39c8935e324a92d5952019ed9fcbe5f395071d48 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 4 Oct 2019 14:48:23 -0700
Subject: [PATCH 225/245] switch default LLVM SYCL to triSYCL and other fixes

---
 common/make.defs.llvm | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 180664d73..318e64595 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -58,10 +58,10 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib
 # SYCL flags
 #
 # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md
-SYCLDIR=/opt/isycl
-SYCLCXX=${SYCLDIR}/bin/clang++
-SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib
-SYCLFLAG+=-std=c++17 -O3
+#SYCLDIR=/opt/isycl
+#SYCLCXX=${SYCLDIR}/bin/clang++
+#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib
+#SYCLFLAG+=-std=c++17 -O3
 # CodePlay ComputeCpp
 #SYCLDIR=/opt/sycl/latest
 #SYCLCXX=${SYCLDIR}/bin/compute++
@@ -80,9 +80,10 @@ SYCLFLAG+=-std=c++17 -O3
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-#SYCLDIR=./triSYCL
+SYCLDIR=./triSYCL
 #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
-#SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include
+SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS)
+SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx

From 619f7cac65053dbed7002e0cf18b6a2de83114e0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 8 Oct 2019 21:54:06 -0700
Subject: [PATCH 226/245] more SYCL USM (#422)

* add example of all the crap required when using https://github.com/boostorg/boost

* fix loop index in lambda, now required by USM

* clean SYCL USM

* cleanup and homogenize nstream sycl and sycl-usm

* cleanup and homogenize nstream sycl and sycl-usm and explicit-sycl

* USM transpose

* create SYCL util header and factor out a bunch of preprocessor crap

* SYCL USM stencil

* rename to avoid failed builds with triSYCL

* use prk::SYCL namespace for SYCL utils

* fix example make.defs

* remove trailing whitespace

* Stencil SYCL USM

* fix build system for SYCL explicit

* fix scope issue

* fix sycl codegen
---
 Cxx11/Makefile                                |  18 +-
 Cxx11/generate-sycl-stencil.py                |  61 ++-
 ...licit-sycl.cc => nstream-sycl-explicit.cc} | 131 ++---
 Cxx11/nstream-sycl-usm.cc                     | 103 +---
 Cxx11/nstream-sycl.cc                         | 115 ++---
 Cxx11/p2p-hyperplane-sycl.cc                  |  39 +-
 Cxx11/prk_sycl.h                              | 104 ++++
 Cxx11/prk_util.h                              |  29 --
 Cxx11/stencil-sycl-usm.cc                     | 341 +++++++++++++
 Cxx11/stencil-sycl.cc                         | 144 ++----
 Cxx11/stencil_sycl.hpp                        | 470 ++++++++++++------
 ...cit-sycl.cc => transpose-sycl-explicit.cc} | 149 ++----
 Cxx11/transpose-sycl-usm.cc                   | 276 ++++++++++
 Cxx11/transpose-sycl.cc                       | 123 ++---
 common/README.freebsd                         |   2 +-
 common/make.defs.gcc                          |  44 +-
 common/make.defs.llvm                         |  23 +-
 17 files changed, 1370 insertions(+), 802 deletions(-)
 rename Cxx11/{nstream-explicit-sycl.cc => nstream-sycl-explicit.cc} (61%)
 create mode 100644 Cxx11/prk_sycl.h
 create mode 100644 Cxx11/stencil-sycl-usm.cc
 rename Cxx11/{transpose-explicit-sycl.cc => transpose-sycl-explicit.cc} (56%)
 create mode 100644 Cxx11/transpose-sycl-usm.cc

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 1bb8d88ce..84665feaf 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -69,7 +69,7 @@ ifneq ($(findstring pgc++,$(CXX)),pgc++)
   EXTRA += tbb pstl
 endif
 
-all: sequential vector valarray openmp taskloop stl rangefor kokkos opencl sycl boost-compute $(EXTRA) # raja
+all: sequential vector valarray openmp taskloop stl rangefor opencl sycl boost-compute $(EXTRA)
 
 #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
@@ -105,7 +105,11 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl
 
 opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl
 
-sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-explicit-sycl nstream-explicit-sycl
+sycl: nstream-sycl p2p-hyperplane-sycl stencil-sycl transpose-sycl
+
+sycl-usm: nstream-sycl-usm stencil-sycl-usm transpose-sycl-usm
+
+sycl-explicit: nstream-sycl-explicit transpose-sycl-explicit
 
 tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \
      p2p-hyperplane-vector-tbb p2p-tasks-tbb
@@ -150,11 +154,17 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
 
 %-opencl: %-opencl.cc prk_util.h prk_opencl.h
-	$(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
+	$(SYCLCXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
 
 %-sycl: %-sycl.cc prk_util.h
 	$(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@
 
+%-sycl-usm: %-sycl-usm.cc prk_util.h
+	$(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@
+
+%-sycl-explicit: %-sycl-explicit.cc prk_util.h
+	$(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@
+
 %-target: %-target.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) -o $@
 
@@ -235,6 +245,8 @@ clean:
 	-rm -f *-taskloop
 	-rm -f *-opencl
 	-rm -f *-sycl
+	-rm -f *-sycl-explicit
+	-rm -f *-sycl-usm
 	-rm -f *-tbb
 	-rm -f *-stl
 	-rm -f *-pstl
diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py
index d88cae37b..404b9edfc 100755
--- a/Cxx11/generate-sycl-stencil.py
+++ b/Cxx11/generate-sycl-stencil.py
@@ -5,31 +5,39 @@
 import string
 import os
 
-def codegen(src,pattern,stencil_size,radius,model,dim):
+def codegen(src,pattern,stencil_size,radius,model,dim,usm):
     src.write('// declare the kernel name used in SYCL parallel_for\n')
-    src.write('template <typename T> class '+pattern+str(radius)+'_'+str(dim)+'d;\n\n')
+    if (usm):
+        kernel_name = pattern+str(radius)+'_usm'
+    else:
+        kernel_name = pattern+str(radius)+'_'+str(dim)+'d'
+    src.write('template <typename T> class '+kernel_name+';\n\n')
     src.write('template <typename T>\n')
-    src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ')
-    if (dim==2):
-        src.write('cl::sycl::buffer<T, 2> & d_in, ')
-        src.write('cl::sycl::buffer<T, 2> & d_out)\n')
+    src.write('void '+pattern+str(radius)+'(sycl::queue & q, const size_t n, ')
+    if (usm):
+        src.write('const T * in, ')
+        src.write('T * out)\n')
+    elif (dim==2):
+        src.write('sycl::buffer<T, 2> & d_in, ')
+        src.write('sycl::buffer<T, 2> & d_out)\n')
     else:
-        src.write('cl::sycl::buffer<T> & d_in, ')
-        src.write('cl::sycl::buffer<T> & d_out)\n')
+        src.write('sycl::buffer<T> & d_in, ')
+        src.write('sycl::buffer<T> & d_out)\n')
     src.write('{\n')
-    src.write('  q.submit([&](cl::sycl::handler& h) {\n')
-    src.write('    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);\n')
-    src.write('    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);\n')
+    src.write('  q.submit([&](sycl::handler& h) {\n')
+    if (not usm):
+        src.write('    auto in  = d_in.template get_access<sycl::access::mode::read>(h);\n')
+        src.write('    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);\n')
     if (dim==2):
         for r in range(1,radius+1):
-            src.write('    cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n')
-            src.write('    cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n')
-    src.write('    h.parallel_for<class '+pattern+str(radius)+'_'+str(dim)+'d<T>>(')
-    src.write('cl::sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ')
-    src.write('cl::sycl::id<2> {'+str(radius)+','+str(radius)+'}, ')
-    src.write('[=] (cl::sycl::item<2> it) {\n')
+            src.write('    sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n')
+            src.write('    sycl::id<2> dy'+str(r)+'(sycl::range<2> {0,'+str(r)+'});\n')
+    src.write('    h.parallel_for<class '+kernel_name+'<T>>(')
+    src.write('sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ')
+    src.write('sycl::id<2> {'+str(radius)+','+str(radius)+'}, ')
+    src.write('[=] (sycl::item<2> it) {\n')
     if (dim==2):
-        src.write('        cl::sycl::id<2> xy = it.get_id();\n')
+        src.write('        sycl::id<2> xy = it.get_id();\n')
         src.write('        out[xy] += ')
     else:
         # 1D indexing the slow way
@@ -37,7 +45,9 @@ def codegen(src,pattern,stencil_size,radius,model,dim):
         #src.write('        auto j = it[1];\n')
         #src.write('        out[i*n+j] += ')
         # 1D indexing the fast way
-        src.write('        out[it[0]*n+it[1]] += ')
+        src.write('        const auto i = it[0];\n')
+        src.write('        const auto j = it[1];\n')
+        src.write('        out[i*n+j] += ')
     if pattern == 'star':
         for i in range(1,radius+1):
             if (dim==2):
@@ -67,13 +77,13 @@ def codegen(src,pattern,stencil_size,radius,model,dim):
                 if i > 1:
                     src.write('\n')
                     src.write(30*' ')
-                src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * static_cast<T>('+str(+1./(2.*i*radius))+')')
+                src.write('+in[i*n+(j+'+str(i)+')] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 src.write('\n'+30*' ')
-                src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * static_cast<T>('+str(-1./(2.*i*radius))+')')
+                src.write('+in[i*n+(j-'+str(i)+')] * static_cast<T>('+str(-1./(2.*i*radius))+')')
                 src.write('\n'+30*' ')
-                src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * static_cast<T>('+str(+1./(2.*i*radius))+')')
+                src.write('+in[(i+'+str(i)+')*n+j] * static_cast<T>('+str(+1./(2.*i*radius))+')')
                 src.write('\n'+30*' ')
-                src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * static_cast<T>('+str(-1./(2.*i*radius))+')')
+                src.write('+in[(i-'+str(i)+')*n+j] * static_cast<T>('+str(-1./(2.*i*radius))+')')
             if i == radius:
                 src.write(';\n')
     else:
@@ -87,8 +97,9 @@ def instance(src,model,pattern,r):
         stencil_size = 4*r+1
     else:
         stencil_size = (2*r+1)**2
-    codegen(src,pattern,stencil_size,r,model,1)
-    codegen(src,pattern,stencil_size,r,model,2)
+    codegen(src,pattern,stencil_size,r,model,1,False)
+    codegen(src,pattern,stencil_size,r,model,2,False)
+    codegen(src,pattern,stencil_size,r,model,1,True)
 
 def main():
     for model in ['sycl']:
diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-sycl-explicit.cc
similarity index 61%
rename from Cxx11/nstream-explicit-sycl.cc
rename to Cxx11/nstream-sycl-explicit.cc
index 5201a48b1..ef2a0392b 100644
--- a/Cxx11/nstream-explicit-sycl.cc
+++ b/Cxx11/nstream-sycl-explicit.cc
@@ -62,18 +62,13 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
+#include "prk_sycl.h"
 #include "prk_util.h"
 
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
 template <typename T> class nstream;
 
 template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t length)
+void run(sycl::queue & q, int iterations, size_t length)
 {
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
@@ -87,25 +82,28 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 
   try {
 
+    auto ctx = q.get_context();
+    auto dev = q.get_device();
+
 #if PREBUILD_KERNEL
-    cl::sycl::program kernel(q.get_context());
+    sycl::program kernel(ctx);
     kernel.build_with_kernel_type<nstream<T>>();
 #endif
 
-    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{length} };
-    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{length} };
-    cl::sycl::buffer<T> d_C { cl::sycl::range<1>{length} };
+    sycl::buffer<T> d_A { sycl::range<1>{length} };
+    sycl::buffer<T> d_B { sycl::range<1>{length} };
+    sycl::buffer<T> d_C { sycl::range<1>{length} };
 
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+    q.submit([&](sycl::handler& h) {
+        sycl::accessor<T, 1, sycl::access::mode::write, sycl::access::target::global_buffer> A(d_A, h, sycl::range<1>(length), sycl::id<1>(0));
         h.fill(A,(T)0);
     });
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+    q.submit([&](sycl::handler& h) {
+        sycl::accessor<T, 1, sycl::access::mode::write, sycl::access::target::global_buffer> B(d_B, h, sycl::range<1>(length), sycl::id<1>(0));
         h.fill(B,(T)2);
     });
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+    q.submit([&](sycl::handler& h) {
+        sycl::accessor<T, 1, sycl::access::mode::write, sycl::access::target::global_buffer> C(d_C, h, sycl::range<1>(length), sycl::id<1>(0));
         h.fill(C,(T)2);
     });
     q.wait();
@@ -114,17 +112,18 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 
       if (iter==1) nstream_time = prk::wtime();
 
-      q.submit([&](cl::sycl::handler& h) {
+      q.submit([&](sycl::handler& h) {
 
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read,       cl::sycl::access::target::global_buffer> C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::read_write, sycl::access::target::global_buffer> A(d_A, h, sycl::range<1>(length), sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::read,       sycl::access::target::global_buffer> B(d_B, h, sycl::range<1>(length), sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::read,       sycl::access::target::global_buffer> C(d_C, h, sycl::range<1>(length), sycl::id<1>(0));
 
         h.parallel_for<class nstream<T>>(
 #if PREBUILD_KERNEL
                 kernel.get_kernel<nstream<T>>(),
 #endif
-                cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+                sycl::range<1>{length}, [=] (sycl::id<1> it) {
+            const size_t i = it[0];
             A[i] += B[i] + scalar * C[i];
         });
       });
@@ -136,21 +135,15 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
     // for other device-oriented programming models.
     nstream_time = prk::wtime() - nstream_time;
 
-    q.submit([&](cl::sycl::handler& h) {
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0));
+    q.submit([&](sycl::handler& h) {
+        sycl::accessor<T, 1, sycl::access::mode::read, sycl::access::target::global_buffer> A(d_A, h, sycl::range<1>(length), sycl::id<1>(0));
         h.copy(A,h_A.data());
     });
     q.wait();
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return;
   }
   catch (std::exception & e) {
@@ -247,15 +240,10 @@ int main(int argc, char * argv[])
   try {
 #if SYCL_TRY_CPU_QUEUE
     if (length<100000) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-        run<float>(host, iterations, length);
-        run<double>(host, iterations, length);
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, length);
+        run<double>(q, iterations, length);
     } else {
         std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
@@ -264,68 +252,39 @@ int main(int argc, char * argv[])
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
     if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
         if (has_spir) {
-          run<float>(cpu, iterations, length);
-          run<double>(cpu, iterations, length);
+          run<float>(q, iterations, length);
+          run<double>(q, iterations, length);
         }
     }
 #endif
 
-    // NVIDIA GPU requires ptx64 target and does not work very well
+    // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
     if (1) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
         if (!has_fp64) {
           std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
         }
-        if (has_spir) {
-          run<float>(gpu, iterations, length);
-          if (has_fp64) {
-            run<double>(gpu, iterations, length);
-          }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, length);
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, length);
           if (has_fp64) {
-            run<double>(gpu, iterations, length);
+            run<double>(q, iterations, length);
           }
-#endif
         }
     }
 #endif
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return 1;
   }
   catch (std::exception & e) {
diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc
index 1aed0931c..c92a52bc9 100644
--- a/Cxx11/nstream-sycl-usm.cc
+++ b/Cxx11/nstream-sycl-usm.cc
@@ -62,16 +62,9 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
+#include "prk_sycl.h"
 #include "prk_util.h"
 
-namespace sycl = cl::sycl;
-
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
 template <typename T> class nstream;
 
 template <typename T>
@@ -85,10 +78,6 @@ void run(sycl::queue & q, int iterations, size_t length)
 
   const T scalar(3);
 
-  //std::vector<T> h_A(length,0);
-  //std::vector<T> h_B(length,2);
-  //std::vector<T> h_C(length,2);
-
   T * A;
   T * B;
   T * C;
@@ -103,10 +92,6 @@ void run(sycl::queue & q, int iterations, size_t length)
     kernel.build_with_kernel_type<nstream<T>>();
 #endif
 
-    //sycl::buffer<T,1> d_A { h_A.data(), sycl::range<1>(h_A.size()) };
-    //sycl::buffer<T,1> d_B { h_B.data(), sycl::range<1>(h_B.size()) };
-    //sycl::buffer<T,1> d_C { h_C.data(), sycl::range<1>(h_C.size()) };
-
     A = static_cast<T*>(sycl::malloc_shared(length * sizeof(T), dev, ctx));
     B = static_cast<T*>(sycl::malloc_shared(length * sizeof(T), dev, ctx));
     C = static_cast<T*>(sycl::malloc_shared(length * sizeof(T), dev, ctx));
@@ -122,11 +107,6 @@ void run(sycl::queue & q, int iterations, size_t length)
       if (iter==1) nstream_time = prk::wtime();
 
       q.submit([&](sycl::handler& h) {
-
-        //auto A = d_A.template get_access<sycl::access::mode::read_write>(h);
-        //auto B = d_B.template get_access<sycl::access::mode::read>(h);
-        //auto C = d_C.template get_access<sycl::access::mode::read>(h);
-
         h.parallel_for<class nstream<T>>(
 #if PREBUILD_KERNEL
                 kernel.get_kernel<nstream<T>>(),
@@ -151,13 +131,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   }
   catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return;
   }
   catch (std::exception & e) {
@@ -254,15 +228,10 @@ int main(int argc, char * argv[])
   try {
 #if SYCL_TRY_CPU_QUEUE
     if (length<100000) {
-        sycl::queue host(sycl::host_selector{});
-#ifndef TRISYCL
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
-#endif
-        run<float>(host, iterations, length);
-        run<double>(host, iterations, length);
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, length);
+        run<double>(q, iterations, length);
     } else {
         std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
@@ -271,67 +240,39 @@ int main(int argc, char * argv[])
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
     if (1) {
-        sycl::queue cpu(sycl::cpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
         if (has_spir) {
-          run<float>(cpu, iterations, length);
-          run<double>(cpu, iterations, length);
+          run<float>(q, iterations, length);
+          run<double>(q, iterations, length);
         }
     }
 #endif
-    // NVIDIA GPU requires ptx64 target and does not work very well
+
+    // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
     if (1) {
-        sycl::queue gpu(sycl::gpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
         if (!has_fp64) {
           std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
         }
-        if (has_spir) {
-          run<float>(gpu, iterations, length);
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, length);
           if (has_fp64) {
-            run<double>(gpu, iterations, length);
+            run<double>(q, iterations, length);
           }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, length);
-          if (has_fp64) {
-            run<double>(gpu, iterations, length);
-          }
-#endif
         }
     }
 #endif
   }
   catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return 1;
   }
   catch (std::exception & e) {
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index 2657f7200..bc52e6649 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -62,18 +62,13 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
+#include "prk_sycl.h"
 #include "prk_util.h"
 
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
 template <typename T> class nstream;
 
 template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t length)
+void run(sycl::queue & q, int iterations, size_t length)
 {
   //////////////////////////////////////////////////////////////////////
   // Allocate space and perform the computation
@@ -89,30 +84,33 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
 
   try {
 
+    auto ctx = q.get_context();
+
 #if PREBUILD_KERNEL
-    cl::sycl::program kernel(q.get_context());
+    sycl::program kernel(ctx);
     kernel.build_with_kernel_type<nstream<T>>();
 #endif
 
-    cl::sycl::buffer<T,1> d_A { h_A.data(), cl::sycl::range<1>(h_A.size()) };
-    cl::sycl::buffer<T,1> d_B { h_B.data(), cl::sycl::range<1>(h_B.size()) };
-    cl::sycl::buffer<T,1> d_C { h_C.data(), cl::sycl::range<1>(h_C.size()) };
+    sycl::buffer<T,1> d_A { h_A.data(), sycl::range<1>(h_A.size()) };
+    sycl::buffer<T,1> d_B { h_B.data(), sycl::range<1>(h_B.size()) };
+    sycl::buffer<T,1> d_C { h_C.data(), sycl::range<1>(h_C.size()) };
 
     for (int iter = 0; iter<=iterations; ++iter) {
 
       if (iter==1) nstream_time = prk::wtime();
 
-      q.submit([&](cl::sycl::handler& h) {
+      q.submit([&](sycl::handler& h) {
 
-        auto A = d_A.template get_access<cl::sycl::access::mode::read_write>(h);
-        auto B = d_B.template get_access<cl::sycl::access::mode::read>(h);
-        auto C = d_C.template get_access<cl::sycl::access::mode::read>(h);
+        auto A = d_A.template get_access<sycl::access::mode::read_write>(h);
+        auto B = d_B.template get_access<sycl::access::mode::read>(h);
+        auto C = d_C.template get_access<sycl::access::mode::read>(h);
 
         h.parallel_for<class nstream<T>>(
 #if PREBUILD_KERNEL
                 kernel.get_kernel<nstream<T>>(),
 #endif
-                cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) {
+                sycl::range<1>{length}, [=] (sycl::id<1> it) {
+            const size_t i = it[0];
             A[i] += B[i] + scalar * C[i];
         });
       });
@@ -124,15 +122,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length)
     // for other device-oriented programming models.
     nstream_time = prk::wtime() - nstream_time;
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return;
   }
   catch (std::exception & e) {
@@ -229,15 +221,10 @@ int main(int argc, char * argv[])
   try {
 #if SYCL_TRY_CPU_QUEUE
     if (length<100000) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-        run<float>(host, iterations, length);
-        run<double>(host, iterations, length);
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, length);
+        run<double>(q, iterations, length);
     } else {
         std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
@@ -246,67 +233,39 @@ int main(int argc, char * argv[])
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
     if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
         if (has_spir) {
-          run<float>(cpu, iterations, length);
-          run<double>(cpu, iterations, length);
+          run<float>(q, iterations, length);
+          run<double>(q, iterations, length);
         }
     }
 #endif
-    // NVIDIA GPU requires ptx64 target and does not work very well
+
+    // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
     if (1) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
         if (!has_fp64) {
           std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
         }
-        if (has_spir) {
-          run<float>(gpu, iterations, length);
-          if (has_fp64) {
-            run<double>(gpu, iterations, length);
-          }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, length);
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, length);
           if (has_fp64) {
-            run<double>(gpu, iterations, length);
+            run<double>(q, iterations, length);
           }
-#endif
         }
     }
 #endif
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return 1;
   }
   catch (std::exception & e) {
diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc
index a738beffa..05e3adeb5 100644
--- a/Cxx11/p2p-hyperplane-sycl.cc
+++ b/Cxx11/p2p-hyperplane-sycl.cc
@@ -59,8 +59,7 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
-
+#include "prk_sycl.h"
 #include "prk_util.h"
 #include "p2p-kernel.h"
 
@@ -131,9 +130,9 @@ int main(int argc, char* argv[])
     h_grid[j*n+0] = static_cast<double>(j);
   }
 
-  cl::sycl::queue q;
+  sycl::queue q;
   {
-    cl::sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
+    sycl::buffer<double> d_grid { h_grid.data(), h_grid.size() };
 
     for (auto iter = 0; iter<=iterations; iter++) {
 
@@ -141,36 +140,36 @@ int main(int argc, char* argv[])
 
       for (int i=2; i<=2*n-2; i++) {
 
-        cl::sycl::id<1> I{unsigned(i)};
-        cl::sycl::id<1> One{1};
+        sycl::id<1> I{unsigned(i)};
+        sycl::id<1> One{1};
 
-        q.submit([&](cl::sycl::handler& h) {
+        q.submit([&](sycl::handler& h) {
 
-          auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
+          auto grid = d_grid.get_access<sycl::access::mode::read_write>(h);
 
           unsigned begin = std::max(2,i-n+2);
           unsigned end   = std::min(i,n)+1;
           unsigned range = end-begin;
 
-          h.parallel_for<class sweep>(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) {
+          h.parallel_for<class sweep>(sycl::range<1>{range}, sycl::id<1>{begin}, [=] (sycl::item<1> j) {
             auto J = j.get_id();
-            cl::sycl::id<1> N{unsigned(n)};
-            cl::sycl::id<1> X{I-J+One};
-            cl::sycl::id<1> Y{J-One};
-            cl::sycl::id<1> Xold{X-One}; // x-1
-            cl::sycl::id<1> Yold{Y-One}; // y-1
-            cl::sycl::id<1> index0{X*N+Y};
-            cl::sycl::id<1> index1{Xold*N+Y};
-            cl::sycl::id<1> index2{X*N+Yold};
-            cl::sycl::id<1> index3{Xold*N+Yold};
+            sycl::id<1> N{unsigned(n)};
+            sycl::id<1> X{I-J+One};
+            sycl::id<1> Y{J-One};
+            sycl::id<1> Xold{X-One}; // x-1
+            sycl::id<1> Yold{Y-One}; // y-1
+            sycl::id<1> index0{X*N+Y};
+            sycl::id<1> index1{Xold*N+Y};
+            sycl::id<1> index2{X*N+Yold};
+            sycl::id<1> index3{Xold*N+Yold};
             grid[index0] = grid[index1] + grid[index2] - grid[index3];
           });
         });
         q.wait();
       }
-      q.submit([&](cl::sycl::handler& h) {
+      q.submit([&](sycl::handler& h) {
 
-        auto grid = d_grid.get_access<cl::sycl::access::mode::read_write>(h);
+        auto grid = d_grid.get_access<sycl::access::mode::read_write>(h);
 
         h.single_task<class corner>([=] {
             grid[0*n+0] = -grid[(n-1)*n+(n-1)];
diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h
new file mode 100644
index 000000000..cdd18d211
--- /dev/null
+++ b/Cxx11/prk_sycl.h
@@ -0,0 +1,104 @@
+#ifndef PRK_SYCL_HPP
+#define PRK_SYCL_HPP
+
+#include <cstdlib>
+#include <iostream>
+
+#include "CL/sycl.hpp"
+
+namespace sycl = cl::sycl;
+
+// prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL
+#if defined(TRISYCL) || defined(__HIPSYCL__)
+#define PREBUILD_KERNEL 0
+#else
+#define PREBUILD_KERNEL 1
+#endif
+
+// not all SYCL implementations may support all device types.
+// If an implementation does not find any devices based on a
+// device selector, it will throw an exception.
+// These macros can be used to check if there's any chance
+// of an implementation targeting a CPU and GPU.
+#if !defined(__HIPSYCL__) || defined(HIPSYCL_PLATFORM_CPU)
+#define SYCL_TRY_CPU_QUEUE 1
+#else
+#define SYCL_TRY_CPU_QUEUE 0
+#endif
+
+// !defined(HIPSYCL_PLATFORM_CPU) = !( defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) )
+#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU)
+#define SYCL_TRY_GPU_QUEUE 1
+#else
+#define SYCL_TRY_GPU_QUEUE 0
+#endif
+
+#if 0
+#include "prk_opencl.h"
+#define USE_OPENCL 1
+#endif
+
+namespace prk {
+
+    // There seems to be an issue with the clang CUDA/HIP toolchains not having
+    // std::abort() available
+    void abort(void) {
+#if defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC)
+        abort();
+#else
+        std::abort();
+#endif
+    }
+
+    namespace SYCL {
+
+        void print_device_platform(const sycl::queue & q) {
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
+            auto device      = q.get_device();
+            auto platform    = device.get_platform();
+            std::cout << "SYCL Device:   " << device.get_info<sycl::info::device::name>() << std::endl;
+            std::cout << "SYCL Platform: " << platform.get_info<sycl::info::platform::name>() << std::endl;
+#endif
+        }
+
+        bool has_spir(const sycl::queue & q) {
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
+            auto device = q.get_device();
+            return device.has_extension(sycl::string_class("cl_khr_spir"));
+#else
+            return true;
+#endif
+        }
+
+        bool has_ptx(const sycl::queue & q) {
+#ifdef __COMPUTECPP__
+            return true;
+#else
+            return false;
+#endif
+        }
+
+        bool has_fp64(const sycl::queue & q) {
+#if !defined(TRISYCL) && !defined(__HIPSYCL__)
+            auto device      = q.get_device();
+            return device.has_extension(sycl::string_class("cl_khr_fp64"));
+#else
+            return true;
+#endif
+        }
+
+        void print_exception_details(sycl::exception & e) {
+#ifdef __COMPUTECPP__
+            std::cout << e.get_file_name() << std::endl;
+            std::cout << e.get_line_number() << std::endl;
+            std::cout << e.get_description() << std::endl;
+            std::cout << e.get_cl_error_message() << std::endl;
+            std::cout << e.get_cl_code() << std::endl;
+#endif
+        }
+
+    } // namespace SYCL
+
+} // namespace prk
+
+#endif // PRK_SYCL_HPP
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index 7feaec93d..abdf6388d 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -82,35 +82,6 @@
 #define PRK_UNUSED
 #endif
 
-
-// for SYCL
-
-// prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL
-#if defined(TRISYCL) || defined(__HIPSYCL__)
-#define PREBUILD_KERNEL 0
-#else
-#define PREBUILD_KERNEL 1
-#endif
-
-// not all SYCL implementations may support all device types.
-// If an implementation does not find any devices based on a
-// device selector, it will throw an exception.
-// These macros can be used to check if there's any chance
-// of an implementation targeting a CPU and GPU.
-#if !defined(__HIPSYCL__) || defined(HIPSYCL_PLATFORM_CPU)
-#define SYCL_TRY_CPU_QUEUE 1
-#else
-#define SYCL_TRY_CPU_QUEUE 0
-#endif
-
-// !defined(HIPSYCL_PLATFORM_CPU) = !( defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) )
-#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU)
-#define SYCL_TRY_GPU_QUEUE 1
-#else
-#define SYCL_TRY_GPU_QUEUE 0
-#endif
-
-
 namespace prk {
 
     int get_alignment(void)
diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc
new file mode 100644
index 000000000..1689841c8
--- /dev/null
+++ b/Cxx11/stencil-sycl-usm.cc
@@ -0,0 +1,341 @@
+
+///
+/// Copyright (c) 2017, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///          - RvdW: Removed unrolling pragmas for clarity;
+///            added constant to array "in" at end of each iteration to force
+///            refreshing of neighbor data in parallel versions; August 2013
+///            C++11-ification by Jeff Hammond, May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_sycl.h"
+#include "prk_util.h"
+#include "stencil_sycl.hpp"
+
+template <typename T> class init;
+template <typename T> class add;
+
+template <typename T>
+void nothing(sycl::queue & q, const size_t n, const T * in, T *out)
+{
+    std::cout << "You are trying to use a stencil that does not exist.\n";
+    std::cout << "Please generate the new stencil using the code generator\n";
+    std::cout << "and add it to the case-switch in the driver." << std::endl;
+    prk::abort();
+}
+
+template <typename T>
+void run(sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius)
+{
+  auto stencil = nothing<T>;
+  if (star) {
+      switch (radius) {
+          case 1: stencil = star1; break;
+          case 2: stencil = star2; break;
+          case 3: stencil = star3; break;
+          case 4: stencil = star4; break;
+          case 5: stencil = star5; break;
+      }
+  }
+#if 0
+  else {
+      switch (radius) {
+          case 1: stencil = grid1; break;
+          case 2: stencil = grid2; break;
+          case 3: stencil = grid3; break;
+          case 4: stencil = grid4; break;
+          case 5: stencil = grid5; break;
+      }
+  }
+#endif
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double stencil_time(0);
+
+  T * in;
+  T * out;
+
+  auto ctx = q.get_context();
+  auto dev = q.get_device();
+
+  try {
+
+    in  = static_cast<T*>(sycl::malloc_shared(n * n * sizeof(T), dev, ctx));
+    out = static_cast<T*>(sycl::malloc_shared(n * n * sizeof(T), dev, ctx));
+
+    q.submit([&](sycl::handler& h) {
+
+      h.parallel_for<class init<T>>(sycl::range<2> {n, n}, [=] (sycl::id<2> it) {
+          const auto i = it[0];
+          const auto j = it[1];
+          in[i*n+j] = static_cast<T>(i+j);
+      });
+    });
+    q.wait();
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) stencil_time = prk::wtime();
+
+      stencil(q, n, in, out);
+
+      q.submit([&](sycl::handler& h) {
+        // Add constant to solution to force refresh of neighbor data, if any
+        h.parallel_for<class add<T>>(sycl::range<2> {n, n}, sycl::id<2> {0, 0}, [=] (sycl::id<2> it) {
+            const auto i = it[0];
+            const auto j = it[1];
+            in[i*n+j] += static_cast<T>(1);
+        });
+      });
+      q.wait();
+    }
+    stencil_time = prk::wtime() - stencil_time;
+
+    sycl::free(in, ctx);
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+    prk::SYCL::print_exception_details(e);
+    return;
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // interior of grid with respect to stencil
+  auto active_points = (n-2L*radius)*(n-2L*radius);
+
+  // compute L1 norm in parallel
+  double norm(0);
+  for (int i=radius; i<n-radius; i++) {
+    for (int j=radius; j<n-radius; j++) {
+      norm += std::fabs(out[i*n+j]);
+    }
+  }
+  norm /= active_points;
+
+  sycl::free(out, ctx);
+
+  // verify correctness
+  const double epsilon = 1.0e-8;
+  const double reference_norm = 2*(iterations+1);
+  if (std::fabs(norm-reference_norm) > epsilon) {
+    std::cout << "ERROR: L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+  } else {
+    std::cout << "Solution validates" << std::endl;
+#ifdef VERBOSE
+    std::cout << "L1 norm = " << norm
+              << " Reference L1 norm = " << reference_norm << std::endl;
+#endif
+    const size_t stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
+    size_t flops = (2L*stencil_size+1L) * active_points;
+    double avgtime = stencil_time/iterations;
+    std::cout << 8*sizeof(T) << "B "
+              << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t n, tile_size;
+  bool star = true;
+  size_t radius = 2;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile size> <star/grid> <stencil radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n  = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
+  std::cout << "Radius of stencil    = " << radius << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
+  try {
+#if SYCL_TRY_CPU_QUEUE
+    if (n<10000) {
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, n, tile_size, star, radius);
+        run<double>(q, iterations, n, tile_size, star, radius);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
+    }
+#endif
+
+    // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
+    if (1) {
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        if (has_spir) {
+          run<float>(q, iterations, n, tile_size, star, radius);
+          run<double>(q, iterations, n, tile_size, star, radius);
+        }
+    }
+#endif
+
+    // NVIDIA GPU requires ptx64 target
+#if SYCL_TRY_GPU_QUEUE
+    if (1) {
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, n, tile_size, star, radius);
+          if (has_fp64) {
+            run<double>(q, iterations, n, tile_size, star, radius);
+          }
+        }
+    }
+#endif
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+    prk::SYCL::print_exception_details(e);
+    return 1;
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc
index b333c4194..949e4d632 100644
--- a/Cxx11/stencil-sycl.cc
+++ b/Cxx11/stencil-sycl.cc
@@ -60,41 +60,29 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
+#include "prk_sycl.h"
 #include "prk_util.h"
 #include "stencil_sycl.hpp"
 
-
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
 template <typename T> class init;
 template <typename T> class add;
 
 #if USE_2D_INDEXING
 template <typename T>
-void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
+void nothing(sycl::queue & q, const size_t n, sycl::buffer<T, 2> & d_in, sycl::buffer<T, 2> & d_out)
 #else
 template <typename T>
-void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
+void nothing(sycl::queue & q, const size_t n, sycl::buffer<T> & d_in, sycl::buffer<T> & d_out)
 #endif
 {
     std::cout << "You are trying to use a stencil that does not exist.\n";
     std::cout << "Please generate the new stencil using the code generator\n";
     std::cout << "and add it to the case-switch in the driver." << std::endl;
-    // There seems to be an issue with the clang CUDA/HIP toolchains not having
-    // std::abort() available
-#if defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC)
-    abort();
-#else
-    std::abort();
-#endif
+    prk::abort();
 }
 
 template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius)
+void run(sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius)
 {
   auto stencil = nothing<T>;
   if (star) {
@@ -131,23 +119,23 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
 
     // initialize device buffers from host buffers
 #if USE_2D_INDEXING
-    cl::sycl::buffer<T, 2> d_in  { cl::sycl::range<2> {n, n} };
-    cl::sycl::buffer<T, 2> d_out { h_out.data(), cl::sycl::range<2> {n, n} };
+    sycl::buffer<T, 2> d_in  { sycl::range<2> {n, n} };
+    sycl::buffer<T, 2> d_out { h_out.data(), sycl::range<2> {n, n} };
 #else
     // FIXME: if I don't initialize this buffer from host, the results are wrong.  Why?
-    //cl::sycl::buffer<T> d_in  { cl::sycl::range<1> {n*n} };
-    cl::sycl::buffer<T> d_in  { h_in.data(),  h_in.size() };
-    cl::sycl::buffer<T> d_out { h_out.data(), h_out.size() };
+    //sycl::buffer<T> d_in  { sycl::range<1> {n*n} };
+    sycl::buffer<T> d_in  { h_in.data(),  h_in.size() };
+    sycl::buffer<T> d_out { h_out.data(), h_out.size() };
 #endif
 
-    q.submit([&](cl::sycl::handler& h) {
+    q.submit([&](sycl::handler& h) {
 
       // accessor methods
-      auto in  = d_in.template get_access<cl::sycl::access::mode::read_write>(h);
+      auto in  = d_in.template get_access<sycl::access::mode::read_write>(h);
 
-      h.parallel_for<class init<T>>(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) {
+      h.parallel_for<class init<T>>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) {
 #if USE_2D_INDEXING
-          cl::sycl::id<2> xy = it.get_id();
+          sycl::id<2> xy = it.get_id();
           auto i = it[0];
           auto j = it[1];
           in[xy] = static_cast<T>(i+j);
@@ -160,7 +148,7 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
     });
     q.wait();
 
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) stencil_time = prk::wtime();
 
@@ -169,16 +157,16 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
       q.wait();
 #endif
 
-      q.submit([&](cl::sycl::handler& h) {
+      q.submit([&](sycl::handler& h) {
 
         // accessor methods
-        auto in  = d_in.template get_access<cl::sycl::access::mode::read_write>(h);
+        auto in  = d_in.template get_access<sycl::access::mode::read_write>(h);
 
         // Add constant to solution to force refresh of neighbor data, if any
-        h.parallel_for<class add<T>>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0},
-                                  [=] (cl::sycl::item<2> it) {
+        h.parallel_for<class add<T>>(sycl::range<2> {n, n}, sycl::id<2> {0, 0},
+                                  [=] (sycl::item<2> it) {
 #if USE_2D_INDEXING
-            cl::sycl::id<2> xy = it.get_id();
+            sycl::id<2> xy = it.get_id();
             in[xy] += static_cast<T>(1);
 #else
 #if 0 // This is noticeably slower :-(
@@ -195,15 +183,9 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s
     }
     stencil_time = prk::wtime() - stencil_time;
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return;
   }
   catch (std::exception & e) {
@@ -328,87 +310,53 @@ int main(int argc, char * argv[])
 #endif
 
   try {
-
 #if SYCL_TRY_CPU_QUEUE
-    if (1) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-
-        run<float>(host, iterations, n, tile_size, star, radius);
-        run<double>(host, iterations, n, tile_size, star, radius);
+    if (n<10000) {
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, n, tile_size, star, radius);
+        run<double>(q, iterations, n, tile_size, star, radius);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
 #endif
 
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
     if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
         if (has_spir) {
-          run<float>(cpu, iterations, n, tile_size, star, radius);
-          run<double>(cpu, iterations, n, tile_size, star, radius);
+          run<float>(q, iterations, n, tile_size, star, radius);
+          run<double>(q, iterations, n, tile_size, star, radius);
         }
     }
 #endif
 
-    // NVIDIA GPU requires ptx64 target and does not work very well
+    // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
-    if (0) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
+    if (1) {
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
         if (!has_fp64) {
           std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
         }
-        if (has_spir) {
-          run<float>(gpu, iterations, n, tile_size, star, radius);
-          if (has_fp64) {
-            run<double>(gpu, iterations, n, tile_size, star, radius);
-          }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, n, tile_size, star, radius);
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, n, tile_size, star, radius);
           if (has_fp64) {
-            run<double>(gpu, iterations, n, tile_size, star, radius);
+            run<double>(q, iterations, n, tile_size, star, radius);
           }
-#endif
         }
     }
 #endif
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return 1;
   }
   catch (std::exception & e) {
diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp
index 41412e5b4..de3cde61b 100644
--- a/Cxx11/stencil_sycl.hpp
+++ b/Cxx11/stencil_sycl.hpp
@@ -2,16 +2,18 @@
 template <typename T> class star1_1d;
 
 template <typename T>
-void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
+void star1(sycl::queue & q, const size_t n, sycl::buffer<T> & d_in, sycl::buffer<T> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star1_1d<T>>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.5)
-                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.5)
-                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.5)
-                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.5);
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    h.parallel_for<class star1_1d<T>>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.5)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.5)
+                              +in[(i+1)*n+j] * static_cast<T>(0.5)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.5);
     });
   });
 }
@@ -20,15 +22,15 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
 template <typename T> class star1_2d;
 
 template <typename T>
-void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
+void star1(sycl::queue & q, const size_t n, sycl::buffer<T, 2> & d_in, sycl::buffer<T, 2> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    h.parallel_for<class star1_2d<T>>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) {
-        cl::sycl::id<2> xy = it.get_id();
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    sycl::id<2> dx1(sycl::range<2> {1,0});
+    sycl::id<2> dy1(sycl::range<2> {0,1});
+    h.parallel_for<class star1_2d<T>>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) {
+        sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.5)
                    +in[xy-dx1] * static_cast<T>(-0.5)
                    +in[xy+dy1] * static_cast<T>(0.5)
@@ -37,24 +39,44 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star1_usm;
+
+template <typename T>
+void star1(sycl::queue & q, const size_t n, const T * in, T * out)
+{
+  q.submit([&](sycl::handler& h) {
+    h.parallel_for<class star1_usm<T>>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.5)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.5)
+                              +in[(i+1)*n+j] * static_cast<T>(0.5)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.5);
+    });
+  });
+}
+
 // declare the kernel name used in SYCL parallel_for
 template <typename T> class star2_1d;
 
 template <typename T>
-void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
+void star2(sycl::queue & q, const size_t n, sycl::buffer<T> & d_in, sycl::buffer<T> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star2_1d<T>>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.25)
-                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.25)
-                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.25)
-                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.25)
-                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.125)
-                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.125)
-                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.125)
-                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.125);
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    h.parallel_for<class star2_1d<T>>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.25)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.25)
+                              +in[(i+1)*n+j] * static_cast<T>(0.25)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.25)
+                              +in[i*n+(j+2)] * static_cast<T>(0.125)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.125)
+                              +in[(i+2)*n+j] * static_cast<T>(0.125)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.125);
     });
   });
 }
@@ -63,17 +85,17 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
 template <typename T> class star2_2d;
 
 template <typename T>
-void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
+void star2(sycl::queue & q, const size_t n, sycl::buffer<T, 2> & d_in, sycl::buffer<T, 2> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-    h.parallel_for<class star2_2d<T>>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) {
-        cl::sycl::id<2> xy = it.get_id();
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    sycl::id<2> dx1(sycl::range<2> {1,0});
+    sycl::id<2> dy1(sycl::range<2> {0,1});
+    sycl::id<2> dx2(sycl::range<2> {2,0});
+    sycl::id<2> dy2(sycl::range<2> {0,2});
+    h.parallel_for<class star2_2d<T>>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) {
+        sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.25)
                    +in[xy-dx1] * static_cast<T>(-0.25)
                    +in[xy+dy1] * static_cast<T>(0.25)
@@ -86,28 +108,52 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star2_usm;
+
+template <typename T>
+void star2(sycl::queue & q, const size_t n, const T * in, T * out)
+{
+  q.submit([&](sycl::handler& h) {
+    h.parallel_for<class star2_usm<T>>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.25)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.25)
+                              +in[(i+1)*n+j] * static_cast<T>(0.25)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.25)
+                              +in[i*n+(j+2)] * static_cast<T>(0.125)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.125)
+                              +in[(i+2)*n+j] * static_cast<T>(0.125)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.125);
+    });
+  });
+}
+
 // declare the kernel name used in SYCL parallel_for
 template <typename T> class star3_1d;
 
 template <typename T>
-void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
+void star3(sycl::queue & q, const size_t n, sycl::buffer<T> & d_in, sycl::buffer<T> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star3_1d<T>>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.16666666666666666)
-                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.16666666666666666)
-                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.16666666666666666)
-                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.16666666666666666)
-                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.08333333333333333)
-                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.08333333333333333)
-                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.08333333333333333)
-                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.08333333333333333)
-                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.05555555555555555)
-                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.05555555555555555)
-                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.05555555555555555)
-                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.05555555555555555);
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    h.parallel_for<class star3_1d<T>>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.166666666667)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.166666666667)
+                              +in[(i+1)*n+j] * static_cast<T>(0.166666666667)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.166666666667)
+                              +in[i*n+(j+2)] * static_cast<T>(0.0833333333333)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.0833333333333)
+                              +in[(i+2)*n+j] * static_cast<T>(0.0833333333333)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.0833333333333)
+                              +in[i*n+(j+3)] * static_cast<T>(0.0555555555556)
+                              +in[i*n+(j-3)] * static_cast<T>(-0.0555555555556)
+                              +in[(i+3)*n+j] * static_cast<T>(0.0555555555556)
+                              +in[(i-3)*n+j] * static_cast<T>(-0.0555555555556);
     });
   });
 }
@@ -116,31 +162,57 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
 template <typename T> class star3_2d;
 
 template <typename T>
-void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
+void star3(sycl::queue & q, const size_t n, sycl::buffer<T, 2> & d_in, sycl::buffer<T, 2> & d_out)
+{
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    sycl::id<2> dx1(sycl::range<2> {1,0});
+    sycl::id<2> dy1(sycl::range<2> {0,1});
+    sycl::id<2> dx2(sycl::range<2> {2,0});
+    sycl::id<2> dy2(sycl::range<2> {0,2});
+    sycl::id<2> dx3(sycl::range<2> {3,0});
+    sycl::id<2> dy3(sycl::range<2> {0,3});
+    h.parallel_for<class star3_2d<T>>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) {
+        sycl::id<2> xy = it.get_id();
+        out[xy] += +in[xy+dx1] * static_cast<T>(0.166666666667)
+                   +in[xy-dx1] * static_cast<T>(-0.166666666667)
+                   +in[xy+dy1] * static_cast<T>(0.166666666667)
+                   +in[xy-dy1] * static_cast<T>(-0.166666666667)
+                   +in[xy+dx2] * static_cast<T>(0.0833333333333)
+                   +in[xy-dx2] * static_cast<T>(-0.0833333333333)
+                   +in[xy+dy2] * static_cast<T>(0.0833333333333)
+                   +in[xy-dy2] * static_cast<T>(-0.0833333333333)
+                   +in[xy+dx3] * static_cast<T>(0.0555555555556)
+                   +in[xy-dx3] * static_cast<T>(-0.0555555555556)
+                   +in[xy+dy3] * static_cast<T>(0.0555555555556)
+                   +in[xy-dy3] * static_cast<T>(-0.0555555555556);
+    });
+  });
+}
+
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star3_usm;
+
+template <typename T>
+void star3(sycl::queue & q, const size_t n, const T * in, T * out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-    cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
-    cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-    h.parallel_for<class star3_2d<T>>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) {
-        cl::sycl::id<2> xy = it.get_id();
-        out[xy] += +in[xy+dx1] * static_cast<T>(0.16666666666666666)
-                   +in[xy-dx1] * static_cast<T>(-0.16666666666666666)
-                   +in[xy+dy1] * static_cast<T>(0.16666666666666666)
-                   +in[xy-dy1] * static_cast<T>(-0.16666666666666666)
-                   +in[xy+dx2] * static_cast<T>(0.08333333333333333)
-                   +in[xy-dx2] * static_cast<T>(-0.08333333333333333)
-                   +in[xy+dy2] * static_cast<T>(0.08333333333333333)
-                   +in[xy-dy2] * static_cast<T>(-0.08333333333333333)
-                   +in[xy+dx3] * static_cast<T>(0.05555555555555555)
-                   +in[xy-dx3] * static_cast<T>(-0.05555555555555555)
-                   +in[xy+dy3] * static_cast<T>(0.05555555555555555)
-                   +in[xy-dy3] * static_cast<T>(-0.05555555555555555);
+  q.submit([&](sycl::handler& h) {
+    h.parallel_for<class star3_usm<T>>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.166666666667)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.166666666667)
+                              +in[(i+1)*n+j] * static_cast<T>(0.166666666667)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.166666666667)
+                              +in[i*n+(j+2)] * static_cast<T>(0.0833333333333)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.0833333333333)
+                              +in[(i+2)*n+j] * static_cast<T>(0.0833333333333)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.0833333333333)
+                              +in[i*n+(j+3)] * static_cast<T>(0.0555555555556)
+                              +in[i*n+(j-3)] * static_cast<T>(-0.0555555555556)
+                              +in[(i+3)*n+j] * static_cast<T>(0.0555555555556)
+                              +in[(i-3)*n+j] * static_cast<T>(-0.0555555555556);
     });
   });
 }
@@ -149,28 +221,30 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
 template <typename T> class star4_1d;
 
 template <typename T>
-void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
+void star4(sycl::queue & q, const size_t n, sycl::buffer<T> & d_in, sycl::buffer<T> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star4_1d<T>>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.125)
-                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.125)
-                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.125)
-                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.125)
-                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.0625)
-                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.0625)
-                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.0625)
-                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.0625)
-                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.041666666666666664)
-                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.041666666666666664)
-                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.041666666666666664)
-                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.041666666666666664)
-                              +in[it[0]*n+(it[1]+4)] * static_cast<T>(0.03125)
-                              +in[it[0]*n+(it[1]-4)] * static_cast<T>(-0.03125)
-                              +in[(it[0]+4)*n+it[1]] * static_cast<T>(0.03125)
-                              +in[(it[0]-4)*n+it[1]] * static_cast<T>(-0.03125);
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    h.parallel_for<class star4_1d<T>>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.125)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.125)
+                              +in[(i+1)*n+j] * static_cast<T>(0.125)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.125)
+                              +in[i*n+(j+2)] * static_cast<T>(0.0625)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.0625)
+                              +in[(i+2)*n+j] * static_cast<T>(0.0625)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.0625)
+                              +in[i*n+(j+3)] * static_cast<T>(0.0416666666667)
+                              +in[i*n+(j-3)] * static_cast<T>(-0.0416666666667)
+                              +in[(i+3)*n+j] * static_cast<T>(0.0416666666667)
+                              +in[(i-3)*n+j] * static_cast<T>(-0.0416666666667)
+                              +in[i*n+(j+4)] * static_cast<T>(0.03125)
+                              +in[i*n+(j-4)] * static_cast<T>(-0.03125)
+                              +in[(i+4)*n+j] * static_cast<T>(0.03125)
+                              +in[(i-4)*n+j] * static_cast<T>(-0.03125);
     });
   });
 }
@@ -179,21 +253,21 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
 template <typename T> class star4_2d;
 
 template <typename T>
-void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
+void star4(sycl::queue & q, const size_t n, sycl::buffer<T, 2> & d_in, sycl::buffer<T, 2> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-    cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
-    cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-    cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
-    cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
-    h.parallel_for<class star4_2d<T>>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) {
-        cl::sycl::id<2> xy = it.get_id();
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    sycl::id<2> dx1(sycl::range<2> {1,0});
+    sycl::id<2> dy1(sycl::range<2> {0,1});
+    sycl::id<2> dx2(sycl::range<2> {2,0});
+    sycl::id<2> dy2(sycl::range<2> {0,2});
+    sycl::id<2> dx3(sycl::range<2> {3,0});
+    sycl::id<2> dy3(sycl::range<2> {0,3});
+    sycl::id<2> dx4(sycl::range<2> {4,0});
+    sycl::id<2> dy4(sycl::range<2> {0,4});
+    h.parallel_for<class star4_2d<T>>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) {
+        sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.125)
                    +in[xy-dx1] * static_cast<T>(-0.125)
                    +in[xy+dy1] * static_cast<T>(0.125)
@@ -202,10 +276,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
                    +in[xy-dx2] * static_cast<T>(-0.0625)
                    +in[xy+dy2] * static_cast<T>(0.0625)
                    +in[xy-dy2] * static_cast<T>(-0.0625)
-                   +in[xy+dx3] * static_cast<T>(0.041666666666666664)
-                   +in[xy-dx3] * static_cast<T>(-0.041666666666666664)
-                   +in[xy+dy3] * static_cast<T>(0.041666666666666664)
-                   +in[xy-dy3] * static_cast<T>(-0.041666666666666664)
+                   +in[xy+dx3] * static_cast<T>(0.0416666666667)
+                   +in[xy-dx3] * static_cast<T>(-0.0416666666667)
+                   +in[xy+dy3] * static_cast<T>(0.0416666666667)
+                   +in[xy-dy3] * static_cast<T>(-0.0416666666667)
                    +in[xy+dx4] * static_cast<T>(0.03125)
                    +in[xy-dx4] * static_cast<T>(-0.03125)
                    +in[xy+dy4] * static_cast<T>(0.03125)
@@ -214,36 +288,68 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star4_usm;
+
+template <typename T>
+void star4(sycl::queue & q, const size_t n, const T * in, T * out)
+{
+  q.submit([&](sycl::handler& h) {
+    h.parallel_for<class star4_usm<T>>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.125)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.125)
+                              +in[(i+1)*n+j] * static_cast<T>(0.125)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.125)
+                              +in[i*n+(j+2)] * static_cast<T>(0.0625)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.0625)
+                              +in[(i+2)*n+j] * static_cast<T>(0.0625)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.0625)
+                              +in[i*n+(j+3)] * static_cast<T>(0.0416666666667)
+                              +in[i*n+(j-3)] * static_cast<T>(-0.0416666666667)
+                              +in[(i+3)*n+j] * static_cast<T>(0.0416666666667)
+                              +in[(i-3)*n+j] * static_cast<T>(-0.0416666666667)
+                              +in[i*n+(j+4)] * static_cast<T>(0.03125)
+                              +in[i*n+(j-4)] * static_cast<T>(-0.03125)
+                              +in[(i+4)*n+j] * static_cast<T>(0.03125)
+                              +in[(i-4)*n+j] * static_cast<T>(-0.03125);
+    });
+  });
+}
+
 // declare the kernel name used in SYCL parallel_for
 template <typename T> class star5_1d;
 
 template <typename T>
-void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::sycl::buffer<T> & d_out)
+void star5(sycl::queue & q, const size_t n, sycl::buffer<T> & d_in, sycl::buffer<T> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    h.parallel_for<class star5_1d<T>>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
-        out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast<T>(0.1)
-                              +in[it[0]*n+(it[1]-1)] * static_cast<T>(-0.1)
-                              +in[(it[0]+1)*n+it[1]] * static_cast<T>(0.1)
-                              +in[(it[0]-1)*n+it[1]] * static_cast<T>(-0.1)
-                              +in[it[0]*n+(it[1]+2)] * static_cast<T>(0.05)
-                              +in[it[0]*n+(it[1]-2)] * static_cast<T>(-0.05)
-                              +in[(it[0]+2)*n+it[1]] * static_cast<T>(0.05)
-                              +in[(it[0]-2)*n+it[1]] * static_cast<T>(-0.05)
-                              +in[it[0]*n+(it[1]+3)] * static_cast<T>(0.03333333333333333)
-                              +in[it[0]*n+(it[1]-3)] * static_cast<T>(-0.03333333333333333)
-                              +in[(it[0]+3)*n+it[1]] * static_cast<T>(0.03333333333333333)
-                              +in[(it[0]-3)*n+it[1]] * static_cast<T>(-0.03333333333333333)
-                              +in[it[0]*n+(it[1]+4)] * static_cast<T>(0.025)
-                              +in[it[0]*n+(it[1]-4)] * static_cast<T>(-0.025)
-                              +in[(it[0]+4)*n+it[1]] * static_cast<T>(0.025)
-                              +in[(it[0]-4)*n+it[1]] * static_cast<T>(-0.025)
-                              +in[it[0]*n+(it[1]+5)] * static_cast<T>(0.02)
-                              +in[it[0]*n+(it[1]-5)] * static_cast<T>(-0.02)
-                              +in[(it[0]+5)*n+it[1]] * static_cast<T>(0.02)
-                              +in[(it[0]-5)*n+it[1]] * static_cast<T>(-0.02);
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    h.parallel_for<class star5_1d<T>>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.1)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.1)
+                              +in[(i+1)*n+j] * static_cast<T>(0.1)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.1)
+                              +in[i*n+(j+2)] * static_cast<T>(0.05)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.05)
+                              +in[(i+2)*n+j] * static_cast<T>(0.05)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.05)
+                              +in[i*n+(j+3)] * static_cast<T>(0.0333333333333)
+                              +in[i*n+(j-3)] * static_cast<T>(-0.0333333333333)
+                              +in[(i+3)*n+j] * static_cast<T>(0.0333333333333)
+                              +in[(i-3)*n+j] * static_cast<T>(-0.0333333333333)
+                              +in[i*n+(j+4)] * static_cast<T>(0.025)
+                              +in[i*n+(j-4)] * static_cast<T>(-0.025)
+                              +in[(i+4)*n+j] * static_cast<T>(0.025)
+                              +in[(i-4)*n+j] * static_cast<T>(-0.025)
+                              +in[i*n+(j+5)] * static_cast<T>(0.02)
+                              +in[i*n+(j-5)] * static_cast<T>(-0.02)
+                              +in[(i+5)*n+j] * static_cast<T>(0.02)
+                              +in[(i-5)*n+j] * static_cast<T>(-0.02);
     });
   });
 }
@@ -252,23 +358,23 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T> & d_in, cl::
 template <typename T> class star5_2d;
 
 template <typename T>
-void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, cl::sycl::buffer<T, 2> & d_out)
+void star5(sycl::queue & q, const size_t n, sycl::buffer<T, 2> & d_in, sycl::buffer<T, 2> & d_out)
 {
-  q.submit([&](cl::sycl::handler& h) {
-    auto in  = d_in.template get_access<cl::sycl::access::mode::read>(h);
-    auto out = d_out.template get_access<cl::sycl::access::mode::read_write>(h);
-    cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0});
-    cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1});
-    cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0});
-    cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2});
-    cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0});
-    cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3});
-    cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0});
-    cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4});
-    cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0});
-    cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5});
-    h.parallel_for<class star5_2d<T>>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) {
-        cl::sycl::id<2> xy = it.get_id();
+  q.submit([&](sycl::handler& h) {
+    auto in  = d_in.template get_access<sycl::access::mode::read>(h);
+    auto out = d_out.template get_access<sycl::access::mode::read_write>(h);
+    sycl::id<2> dx1(sycl::range<2> {1,0});
+    sycl::id<2> dy1(sycl::range<2> {0,1});
+    sycl::id<2> dx2(sycl::range<2> {2,0});
+    sycl::id<2> dy2(sycl::range<2> {0,2});
+    sycl::id<2> dx3(sycl::range<2> {3,0});
+    sycl::id<2> dy3(sycl::range<2> {0,3});
+    sycl::id<2> dx4(sycl::range<2> {4,0});
+    sycl::id<2> dy4(sycl::range<2> {0,4});
+    sycl::id<2> dx5(sycl::range<2> {5,0});
+    sycl::id<2> dy5(sycl::range<2> {0,5});
+    h.parallel_for<class star5_2d<T>>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) {
+        sycl::id<2> xy = it.get_id();
         out[xy] += +in[xy+dx1] * static_cast<T>(0.1)
                    +in[xy-dx1] * static_cast<T>(-0.1)
                    +in[xy+dy1] * static_cast<T>(0.1)
@@ -277,10 +383,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
                    +in[xy-dx2] * static_cast<T>(-0.05)
                    +in[xy+dy2] * static_cast<T>(0.05)
                    +in[xy-dy2] * static_cast<T>(-0.05)
-                   +in[xy+dx3] * static_cast<T>(0.03333333333333333)
-                   +in[xy-dx3] * static_cast<T>(-0.03333333333333333)
-                   +in[xy+dy3] * static_cast<T>(0.03333333333333333)
-                   +in[xy-dy3] * static_cast<T>(-0.03333333333333333)
+                   +in[xy+dx3] * static_cast<T>(0.0333333333333)
+                   +in[xy-dx3] * static_cast<T>(-0.0333333333333)
+                   +in[xy+dy3] * static_cast<T>(0.0333333333333)
+                   +in[xy-dy3] * static_cast<T>(-0.0333333333333)
                    +in[xy+dx4] * static_cast<T>(0.025)
                    +in[xy-dx4] * static_cast<T>(-0.025)
                    +in[xy+dy4] * static_cast<T>(0.025)
@@ -293,3 +399,37 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer<T, 2> & d_in, c
   });
 }
 
+// declare the kernel name used in SYCL parallel_for
+template <typename T> class star5_usm;
+
+template <typename T>
+void star5(sycl::queue & q, const size_t n, const T * in, T * out)
+{
+  q.submit([&](sycl::handler& h) {
+    h.parallel_for<class star5_usm<T>>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) {
+        const auto i = it[0];
+        const auto j = it[1];
+        out[i*n+j] += +in[i*n+(j+1)] * static_cast<T>(0.1)
+                              +in[i*n+(j-1)] * static_cast<T>(-0.1)
+                              +in[(i+1)*n+j] * static_cast<T>(0.1)
+                              +in[(i-1)*n+j] * static_cast<T>(-0.1)
+                              +in[i*n+(j+2)] * static_cast<T>(0.05)
+                              +in[i*n+(j-2)] * static_cast<T>(-0.05)
+                              +in[(i+2)*n+j] * static_cast<T>(0.05)
+                              +in[(i-2)*n+j] * static_cast<T>(-0.05)
+                              +in[i*n+(j+3)] * static_cast<T>(0.0333333333333)
+                              +in[i*n+(j-3)] * static_cast<T>(-0.0333333333333)
+                              +in[(i+3)*n+j] * static_cast<T>(0.0333333333333)
+                              +in[(i-3)*n+j] * static_cast<T>(-0.0333333333333)
+                              +in[i*n+(j+4)] * static_cast<T>(0.025)
+                              +in[i*n+(j-4)] * static_cast<T>(-0.025)
+                              +in[(i+4)*n+j] * static_cast<T>(0.025)
+                              +in[(i-4)*n+j] * static_cast<T>(-0.025)
+                              +in[i*n+(j+5)] * static_cast<T>(0.02)
+                              +in[i*n+(j-5)] * static_cast<T>(-0.02)
+                              +in[(i+5)*n+j] * static_cast<T>(0.02)
+                              +in[(i-5)*n+j] * static_cast<T>(-0.02);
+    });
+  });
+}
+
diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-sycl-explicit.cc
similarity index 56%
rename from Cxx11/transpose-explicit-sycl.cc
rename to Cxx11/transpose-sycl-explicit.cc
index e92dfaa1f..a1dae3bc9 100644
--- a/Cxx11/transpose-explicit-sycl.cc
+++ b/Cxx11/transpose-sycl-explicit.cc
@@ -49,19 +49,14 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
+#include "prk_sycl.h"
 #include "prk_util.h"
 
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
 template <typename T> class iota;
 template <typename T> class transpose;
 
 template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t order)
+void run(sycl::queue & q, int iterations, size_t order)
 {
   //////////////////////////////////////////////////////////////////////
   // Allocate space for the input and transpose matrix
@@ -73,37 +68,39 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
 
   try {
 
+    auto ctx = q.get_context();
+
 #if PREBUILD_KERNEL
-    cl::sycl::program kernel(q.get_context());
+    sycl::program kernel(ctx);
     kernel.build_with_kernel_type<transpose<T>>();
 #endif
 
 #if USE_2D_INDEXING
-    cl::sycl::buffer<T,2> d_A( cl::sycl::range<2>{order,order} );
-    cl::sycl::buffer<T,2> d_B( cl::sycl::range<2>{order,order} );
+    sycl::buffer<T,2> d_A( sycl::range<2>{order,order} );
+    sycl::buffer<T,2> d_B( sycl::range<2>{order,order} );
 #else
-    cl::sycl::buffer<T> d_A { cl::sycl::range<1>{order*order}  };
-    cl::sycl::buffer<T> d_B { cl::sycl::range<1>{order*order}  };
+    sycl::buffer<T> d_A { sycl::range<1>{order*order}  };
+    sycl::buffer<T> d_B { sycl::range<1>{order*order}  };
 #endif
 
-    q.submit([&](cl::sycl::handler& h) {
+    q.submit([&](sycl::handler& h) {
 #if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-        h.parallel_for<class iota<T>>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) {
+        sycl::accessor<T, 2, sycl::access::mode::write, sycl::access::target::global_buffer> A(d_A, h, sycl::range<2>(order,order), sycl::id<2>(0,0));
+        h.parallel_for<class iota<T>>(sycl::range<2>{order,order}, [=] (sycl::item<2> i) {
             A[i] = i[0] * order + i[1];
         });
 #else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-        h.parallel_for<class iota<T>>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) {
+        sycl::accessor<T, 1, sycl::access::mode::write, sycl::access::target::global_buffer> A(d_A, h, sycl::range<1>(order*order), sycl::id<1>(0));
+        h.parallel_for<class iota<T>>(sycl::range<1>{order*order}, [=] (sycl::item<1> i) {
             A[i] = i[0];
         });
 #endif
     });
-    q.submit([&](cl::sycl::handler& h) {
+    q.submit([&](sycl::handler& h) {
 #if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        sycl::accessor<T, 2, sycl::access::mode::write, sycl::access::target::global_buffer> B(d_B, h, sycl::range<2>(order,order), sycl::id<2>(0,0));
 #else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::write, sycl::access::target::global_buffer> B(d_B, h, sycl::range<1>(order*order), sycl::id<1>(0));
 #endif
         h.fill(B,(T)0);
     });
@@ -113,24 +110,24 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
 
       if (iter==1) trans_time = prk::wtime();
 
-      q.submit([&](cl::sycl::handler& h) {
+      q.submit([&](sycl::handler& h) {
 
 #if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        sycl::accessor<T, 2, sycl::access::mode::read_write, sycl::access::target::global_buffer> A(d_A, h, sycl::range<2>(order,order), sycl::id<2>(0,0));
+        sycl::accessor<T, 2, sycl::access::mode::read_write, sycl::access::target::global_buffer> B(d_B, h, sycl::range<2>(order,order), sycl::id<2>(0,0));
 #else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::read_write, sycl::access::target::global_buffer> A(d_A, h, sycl::range<1>(order*order), sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::read_write, sycl::access::target::global_buffer> B(d_B, h, sycl::range<1>(order*order), sycl::id<1>(0));
 #endif
 
         h.parallel_for<class transpose<T>>(
 #if PREBUILD_KERNEL
                 kernel.get_kernel<transpose<T>>(),
 #endif
-                cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+                sycl::range<2>{order,order}, [=] (sycl::item<2> it) {
 #if USE_2D_INDEXING
-          cl::sycl::id<2> ij{it[0],it[1]};
-          cl::sycl::id<2> ji{it[1],it[0]};
+          sycl::id<2> ij{it[0],it[1]};
+          sycl::id<2> ji{it[1],it[0]};
           B[ij] += A[ji];
           A[ji] += (T)1;
 #else
@@ -147,25 +144,19 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
     // for other device-oriented programming models.
     trans_time = prk::wtime() - trans_time;
 
-    q.submit([&](cl::sycl::handler& h) {
+    q.submit([&](sycl::handler& h) {
 #if USE_2D_INDEXING
-        cl::sycl::accessor<T, 2, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0));
+        sycl::accessor<T, 2, sycl::access::mode::read, sycl::access::target::global_buffer> B(d_B, h, sycl::range<2>(order,order), sycl::id<2>(0,0));
 #else
-        cl::sycl::accessor<T, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0));
+        sycl::accessor<T, 1, sycl::access::mode::read, sycl::access::target::global_buffer> B(d_B, h, sycl::range<1>(order*order), sycl::id<1>(0));
 #endif
         h.copy(B,h_B.data());
     });
     q.wait();
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return;
   }
   catch (std::exception & e) {
@@ -259,84 +250,52 @@ int main(int argc, char * argv[])
 
   try {
 #if SYCL_TRY_CPU_QUEUE
-    if (1) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-        run<float>(host, iterations, order);
-        run<double>(host, iterations, order);
+    if (order<10000) {
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, order);
+        run<double>(q, iterations, order);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
 #endif
 
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
     if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
         if (has_spir) {
-          run<float>(cpu, iterations, order);
-          run<double>(cpu, iterations, order);
+          run<float>(q, iterations, order);
+          run<double>(q, iterations, order);
         }
     }
 #endif
 
-    // NVIDIA GPU requires ptx64 target and does not work very well
+    // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
-    if (0) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
+    if (1) {
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
         if (!has_fp64) {
           std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
         }
-        if (has_spir) {
-          run<float>(gpu, iterations, order);
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, order);
           if (has_fp64) {
-            run<double>(gpu, iterations, order);
+            run<double>(q, iterations, order);
           }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, order);
-          if (has_fp64) {
-            run<double>(gpu, iterations, order);
-          }
-#endif
         }
     }
 #endif
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return 1;
   }
   catch (std::exception & e) {
diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc
new file mode 100644
index 000000000..a80ce8c83
--- /dev/null
+++ b/Cxx11/transpose-sycl-usm.cc
@@ -0,0 +1,276 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <matrix_size> <# iterations>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by  Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_sycl.h"
+#include "prk_util.h"
+
+template <typename T> class transpose;
+
+template <typename T>
+void run(sycl::queue & q, int iterations, size_t order)
+{
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for the input and transpose matrix
+  //////////////////////////////////////////////////////////////////////
+
+  double trans_time(0);
+
+  auto ctx = q.get_context();
+  auto dev = q.get_device();
+
+  T * A = static_cast<T*>(sycl::malloc_shared(order*order * sizeof(T), dev, ctx));
+  T * B = static_cast<T*>(sycl::malloc_shared(order*order * sizeof(T), dev, ctx));
+
+  for (auto i=0;i<order; i++) {
+    for (auto j=0;j<order;j++) {
+      A[i*order+j] = static_cast<double>(i*order+j);
+      B[i*order+j] = 0.0;
+    }
+  }
+
+  try {
+
+#if PREBUILD_KERNEL
+    sycl::program kernel(ctx);
+    kernel.build_with_kernel_type<transpose<T>>();
+#endif
+
+
+    for (int iter = 0; iter<=iterations; ++iter) {
+
+      if (iter==1) trans_time = prk::wtime();
+
+      q.submit([&](sycl::handler& h) {
+
+        h.parallel_for<class transpose<T>>(
+#if PREBUILD_KERNEL
+                kernel.get_kernel<transpose<T>>(),
+#endif
+                sycl::range<2>{order,order}, [=] (sycl::id<2> it) {
+#if USE_2D_INDEXING
+          sycl::id<2> ij{it[0],it[1]};
+          sycl::id<2> ji{it[1],it[0]};
+          B[ij] += A[ji];
+          A[ji] += (T)1;
+#else
+          B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
+          A[it[1] * order + it[0]] += (T)1;
+#endif
+        });
+      });
+      q.wait();
+    }
+
+    // Stop timer before buffer+accessor destructors fire,
+    // since that will move data, and we do not time that
+    // for other device-oriented programming models.
+    trans_time = prk::wtime() - trans_time;
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+    prk::SYCL::print_exception_details(e);
+    return;
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+    return;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return;
+  }
+
+  sycl::free(A, ctx);
+  sycl::free(B, ctx);
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  // TODO: replace with std::generate, std::accumulate, or similar
+  const T addit = (iterations+1.) * (iterations/2.);
+  double abserr(0);
+  for (size_t i=0; i<order; ++i) {
+    for (size_t j=0; j<order; ++j) {
+      size_t const ij = i*order+j;
+      size_t const ji = j*order+i;
+      const T reference = static_cast<T>(ij)*(1.+iterations)+addit;
+      abserr += std::fabs(B[ji] - reference);
+    }
+  }
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const double epsilon(1.0e-8);
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    double avgtime = trans_time/iterations;
+    double bytes = (size_t)order * (size_t)order * sizeof(T);
+    std::cout << 8*sizeof(T) << "B "
+              << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate squared error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  size_t order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of a the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Setup SYCL environment
+  //////////////////////////////////////////////////////////////////////
+
+#ifdef USE_OPENCL
+  prk::opencl::listPlatforms();
+#endif
+
+  try {
+#if SYCL_TRY_CPU_QUEUE
+    if (order<10000) {
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, order);
+        run<double>(q, iterations, order);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
+    }
+#endif
+
+    // CPU requires spir64 target
+#if SYCL_TRY_CPU_QUEUE
+    if (1) {
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        if (has_spir) {
+          run<float>(q, iterations, order);
+          run<double>(q, iterations, order);
+        }
+    }
+#endif
+
+    // NVIDIA GPU requires ptx64 target
+#if SYCL_TRY_GPU_QUEUE
+    if (1) {
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
+        if (!has_fp64) {
+          std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
+        }
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, order);
+          if (has_fp64) {
+            run<double>(q, iterations, order);
+          }
+        }
+    }
+#endif
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+    prk::SYCL::print_exception_details(e);
+    return 1;
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+    return 1;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index b22b162be..289127265 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -49,18 +49,13 @@
 ///
 //////////////////////////////////////////////////////////////////////
 
-#include "CL/sycl.hpp"
+#include "prk_sycl.h"
 #include "prk_util.h"
 
-#if 0
-#include "prk_opencl.h"
-#define USE_OPENCL 1
-#endif
-
 template <typename T> class transpose;
 
 template <typename T>
-void run(cl::sycl::queue & q, int iterations, size_t order)
+void run(sycl::queue & q, int iterations, size_t order)
 {
   //////////////////////////////////////////////////////////////////////
   // Allocate space for the input and transpose matrix
@@ -76,37 +71,39 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
 
   try {
 
+    auto ctx = q.get_context();
+
 #if PREBUILD_KERNEL
-    cl::sycl::program kernel(q.get_context());
+    sycl::program kernel(ctx);
     kernel.build_with_kernel_type<transpose<T>>();
 #endif
 
 #if USE_2D_INDEXING
-    cl::sycl::buffer<T,2> d_A( h_A.data(), cl::sycl::range<2>{order,order} );
-    cl::sycl::buffer<T,2> d_B( h_B.data(), cl::sycl::range<2>{order,order} );
+    sycl::buffer<T,2> d_A( h_A.data(), sycl::range<2>{order,order} );
+    sycl::buffer<T,2> d_B( h_B.data(), sycl::range<2>{order,order} );
 #else
-    cl::sycl::buffer<T> d_A { h_A.data(), h_A.size() };
-    cl::sycl::buffer<T> d_B { h_B.data(), h_B.size() };
+    sycl::buffer<T> d_A { h_A.data(), h_A.size() };
+    sycl::buffer<T> d_B { h_B.data(), h_B.size() };
 #endif
 
     for (int iter = 0; iter<=iterations; ++iter) {
 
       if (iter==1) trans_time = prk::wtime();
 
-      q.submit([&](cl::sycl::handler& h) {
+      q.submit([&](sycl::handler& h) {
 
         // accessor methods
-        auto A = d_A.template get_access<cl::sycl::access::mode::read_write>(h);
-        auto B = d_B.template get_access<cl::sycl::access::mode::read_write>(h);
+        auto A = d_A.template get_access<sycl::access::mode::read_write>(h);
+        auto B = d_B.template get_access<sycl::access::mode::read_write>(h);
 
         h.parallel_for<class transpose<T>>(
 #if PREBUILD_KERNEL
                 kernel.get_kernel<transpose<T>>(),
 #endif
-                cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) {
+                sycl::range<2>{order,order}, [=] (sycl::item<2> it) {
 #if USE_2D_INDEXING
-          cl::sycl::id<2> ij{it[0],it[1]};
-          cl::sycl::id<2> ji{it[1],it[0]};
+          sycl::id<2> ij{it[0],it[1]};
+          sycl::id<2> ji{it[1],it[0]};
           B[ij] += A[ji];
           A[ji] += (T)1;
 #else
@@ -123,15 +120,9 @@ void run(cl::sycl::queue & q, int iterations, size_t order)
     // for other device-oriented programming models.
     trans_time = prk::wtime() - trans_time;
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return;
   }
   catch (std::exception & e) {
@@ -225,84 +216,52 @@ int main(int argc, char * argv[])
 
   try {
 #if SYCL_TRY_CPU_QUEUE
-    if (1) {
-        cl::sycl::queue host(cl::sycl::host_selector{});
-#ifndef TRISYCL
-        auto device      = host.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-#endif
-        run<float>(host, iterations, order);
-        run<double>(host, iterations, order);
+    if (order<10000) {
+        sycl::queue q(sycl::host_selector{});
+        prk::SYCL::print_device_platform(q);
+        run<float>(q, iterations, order);
+        run<double>(q, iterations, order);
+    } else {
+        std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
 #endif
 
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
     if (1) {
-        cl::sycl::queue cpu(cl::sycl::cpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = cpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-#else
-        bool has_spir = true; // ?
-#endif
+        sycl::queue q(sycl::cpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
         if (has_spir) {
-          run<float>(cpu, iterations, order);
-          run<double>(cpu, iterations, order);
+          run<float>(q, iterations, order);
+          run<double>(q, iterations, order);
         }
     }
 #endif
 
-    // NVIDIA GPU requires ptx64 target and does not work very well
+    // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
-    if (0) {
-        cl::sycl::queue gpu(cl::sycl::gpu_selector{});
-#if !defined(TRISYCL) && !defined(__HIPSYCL__)
-        auto device      = gpu.get_device();
-        auto platform    = device.get_platform();
-        std::cout << "SYCL Device:   " << device.get_info<cl::sycl::info::device::name>() << std::endl;
-        std::cout << "SYCL Platform: " << platform.get_info<cl::sycl::info::platform::name>() << std::endl;
-        bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir"));
-        bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64"));
-#else
-        bool has_spir = true; // ?
-        bool has_fp64 = true;
-#endif
+    if (1) {
+        sycl::queue q(sycl::gpu_selector{});
+        prk::SYCL::print_device_platform(q);
+        bool has_spir = prk::SYCL::has_spir(q);
+        bool has_fp64 = prk::SYCL::has_fp64(q);
+        bool has_ptx  = prk::SYCL::has_ptx(q);
         if (!has_fp64) {
           std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
         }
-        if (has_spir) {
-          run<float>(gpu, iterations, order);
+        if (has_spir || has_ptx) {
+          run<float>(q, iterations, order);
           if (has_fp64) {
-            run<double>(gpu, iterations, order);
+            run<double>(q, iterations, order);
           }
-        } else {
-          std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl;
-#ifdef __COMPUTECPP__
-          std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl;
-          run<float>(gpu, iterations, order);
-          if (has_fp64) {
-            run<double>(gpu, iterations, order);
-          }
-#endif
         }
     }
 #endif
   }
-  catch (cl::sycl::exception & e) {
+  catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
-#ifdef __COMPUTECPP__
-    std::cout << e.get_file_name() << std::endl;
-    std::cout << e.get_line_number() << std::endl;
-    std::cout << e.get_description() << std::endl;
-    std::cout << e.get_cl_error_message() << std::endl;
-    std::cout << e.get_cl_code() << std::endl;
-#endif
+    prk::SYCL::print_exception_details(e);
     return 1;
   }
   catch (std::exception & e) {
diff --git a/common/README.freebsd b/common/README.freebsd
index 8a52f24da..a55746adc 100644
--- a/common/README.freebsd
+++ b/common/README.freebsd
@@ -13,7 +13,7 @@ sudo pkg install clang flang libpgmath
 
 ## C++ dependencies
 
-sudo pkg install opencl-2.2_1 
+sudo pkg install opencl-2.2_1
 sudo pkg install devel/clinfo devel/ocl-icd lang/beignet lang/pocl
 sudo pkg install tbb
 sudo pkg install boost-all
diff --git a/common/make.defs.gcc b/common/make.defs.gcc
index f4552bd87..51e0827cb 100644
--- a/common/make.defs.gcc
+++ b/common/make.defs.gcc
@@ -54,9 +54,9 @@ METALFLAG=-framework MetalPerformanceShaders
 #
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
-SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
-SYCLFLAG=-I$(SYCLDIR)/include
+#SYCLDIR=./triSYCL
+#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
+#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -74,16 +74,19 @@ METALFLAG=-framework MetalPerformanceShaders
 #
 # TBB
 #
-TBBDIR=/usr/local/Cellar/tbb/2019_U5_1
-TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#TBBDIR=/usr/lib/x86_64-linux-gnu
+TBBDIR=/usr/local/Cellar/tbb/2019_U8
+TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb
+#TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include
+#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include
 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}
-#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG}
+#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages
 KOKKOSDIR=/opt/kokkos/gcc
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
@@ -91,33 +94,6 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust
 THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
-# SYCL flags
-#
-# triSYCL
-# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
-SYCLCXX=${CXX} -O3 -Wall -std=c++17 ${OPENMPFLAG}
-SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL
-# ProGTX
-# https://github.com/ProGTX/sycl-gtx
-#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
-#SYCLCXX=${CXX} ${OPENMPFLAG}
-#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
-SYCLFLAG+=${RANGEFLAG}
-#
-# SYCL flags
-#
-# triSYCL
-# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
-SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG}
-SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG}
-# ProGTX
-# https://github.com/ProGTX/sycl-gtx
-#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
-#SYCLCXX=${CXX} ${OPENMPFLAG}
-#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG}
-#
 # CBLAS for C++ DGEMM
 #
 BLASFLAG=-DACCELERATE -framework Accelerate
diff --git a/common/make.defs.llvm b/common/make.defs.llvm
index 318e64595..5804f0681 100644
--- a/common/make.defs.llvm
+++ b/common/make.defs.llvm
@@ -4,7 +4,7 @@
 #
 # Base compilers and language options
 #
-#LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0
+#LLVM_ROOT=/usr/local/Cellar/llvm/9.0.0
 #LLVM_PATH=${LLVM_ROOT}/bin/
 #LLVM_PATH=/opt/llvm/HEAD/bin/
 # C99 is required in some implementations.
@@ -30,6 +30,10 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math
 # These are useful to understand why the compiler does not vectorize loops:
 #   DEFAULT_OPT_FLAGS+=-Rpass-analysis=loop-vectorize
 #   DEFAULT_OPT_FLAGS+=-Rpass=loop-vectorize
+#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed
+DEFAULT_OPT_FLAGS+=-Wall #-Werror
+DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations
+#DEFAULT_OPT_FLAGS+=-mavx -mfma
 #
 # OpenMP flags
 #
@@ -81,9 +85,8 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib
 # triSYCL
 # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
 SYCLDIR=./triSYCL
-#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
 SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS)
-SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL
+SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL
 # ProGTX
 # https://github.com/ProGTX/sycl-gtx
 #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx
@@ -97,14 +100,14 @@ SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL
 # TBB
 #
 #TBBDIR=/usr/lib/x86_64-linux-gnu
-TBBDIR=/usr/local/Cellar/tbb/2018_U3_1
+TBBDIR=/usr/local/Cellar/tbb/2019_U8
 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
 #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb
 #TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-#BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
+#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include
 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages
@@ -130,6 +133,16 @@ CUDAFLAGS=-g -O3 -std=c++11
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
 #
+# Halide
+#
+HALIDECXX=c++
+HALIDEDIR=/opt/halide
+HALIDEFLAG=-I${HALIDEDIR}/include
+HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide
+#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0
+HALIDEFLAG+=${DEFAULT_OPT_FLAGS}
+HALIDEFLAG+=-std=c++17 -g3
+#
 # ISPC
 #
 ISPC=ispc

From 6f7bc302aa85c59097bbbc92d10486f6019c376d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 8 Oct 2019 21:54:41 -0700
Subject: [PATCH 227/245] IBM compiler fixes (#421)

* XLF also missing norm2 intrinsic
* remove unnecessary (erroneous) declare target
* add IBM POWER9 + NVIDIA V100
* rename for preprocessing
* IBM Clang fixes
* do not map RO arrays as tofrom
* override _OPENMP for XLC
* more IBM NV fixes
* add prk::alloc/dealloc to match C (unused)
* Kokkos-CUDA still broken...
* remove unnecessary range
* add missing includes
* update example build file
---
 C1z/Makefile                                  |   4 +
 C1z/prk_util.h                                |   2 +-
 Cxx11/Makefile                                |   5 +
 Cxx11/nstream-device-thrust.cu                |   3 +-
 Cxx11/nstream-openmp-target.cc                |   4 +-
 Cxx11/prk_openmp.h                            |   2 +-
 Cxx11/prk_util.h                              |  27 +++-
 Cxx11/transpose-device-thrust.cu              |   2 +
 FORTRAN/Makefile                              |  28 ++--
 ...nmp-target.f90 => dgemm-openmp-target.F90} |   0
 .../{dgemm-pretty.f90 => dgemm-pretty.F90}    |   0
 ...p-openmp.f90 => dgemm-taskloop-openmp.F90} |   0
 FORTRAN/{dgemm.f90 => dgemm.F90}              |   4 +-
 ...p-target.f90 => nstream-openmp-target.F90} |   0
 ...stream-ornlacc.f90 => nstream-ornlacc.F90} |   0
 ...{nstream-pretty.f90 => nstream-pretty.F90} |   0
 ...openmp.f90 => nstream-taskloop-openmp.F90} |   0
 FORTRAN/{nstream.f90 => nstream.F90}          |   0
 ...sync-ornlacc.f90 => p2p-async-ornlacc.F90} |   0
 FORTRAN/{p2p-coarray.f90 => p2p-coarray.F90}  |   0
 ...oss-openmp.f90 => p2p-doacross-openmp.F90} |   0
 ...op-openmp.f90 => p2p-innerloop-openmp.F90} |   0
 ...-ornlacc.f90 => p2p-innerloop-ornlacc.F90} |   0
 .../{p2p-innerloop.f90 => p2p-innerloop.F90}  |   0
 ...penmp-target.f90 => p2p-openmp-target.F90} |   0
 FORTRAN/{p2p-ornlacc.f90 => p2p-ornlacc.F90}  |   0
 ...-tasks-openmp.f90 => p2p-tasks-openmp.F90} |   0
 FORTRAN/{p2p.f90 => p2p.F90}                  |   0
 ...tencil-coarray.f90 => stencil-coarray.F90} |   0
 ...p-target.f90 => stencil-openmp-target.F90} |   1 -
 ...tencil-ornlacc.f90 => stencil-ornlacc.F90} |   0
 ...{stencil-pretty.f90 => stencil-pretty.F90} |   0
 ...openmp.f90 => stencil-taskloop-openmp.F90} |   0
 FORTRAN/{stencil.f90 => stencil.F90}          |   0
 ...{stencil_openmp.f90 => stencil_openmp.F90} |   0
 ...{stencil_pretty.f90 => stencil_pretty.F90} |   0
 ...{stencil_serial.f90 => stencil_serial.F90} |   0
 ...{stencil_target.f90 => stencil_target.F90} |   0
 ...ncil_taskloop.f90 => stencil_taskloop.F90} |   0
 ...pose-coarray.f90 => transpose-coarray.F90} |   0
 ...target.f90 => transpose-openmp-target.F90} |   0
 ...pose-ornlacc.f90 => transpose-ornlacc.F90} |   0
 ...nspose-pretty.f90 => transpose-pretty.F90} |   2 +-
 ...enmp.f90 => transpose-taskloop-openmp.F90} |   0
 ...-openmp.f90 => transpose-tasks-openmp.F90} |   0
 FORTRAN/{transpose.f90 => transpose.F90}      |   0
 common/make.defs.ibmp9nv                      | 121 ++++++++++++++++++
 47 files changed, 182 insertions(+), 23 deletions(-)
 rename FORTRAN/{dgemm-openmp-target.f90 => dgemm-openmp-target.F90} (100%)
 rename FORTRAN/{dgemm-pretty.f90 => dgemm-pretty.F90} (100%)
 rename FORTRAN/{dgemm-taskloop-openmp.f90 => dgemm-taskloop-openmp.F90} (100%)
 rename FORTRAN/{dgemm.f90 => dgemm.F90} (99%)
 rename FORTRAN/{nstream-openmp-target.f90 => nstream-openmp-target.F90} (100%)
 rename FORTRAN/{nstream-ornlacc.f90 => nstream-ornlacc.F90} (100%)
 rename FORTRAN/{nstream-pretty.f90 => nstream-pretty.F90} (100%)
 rename FORTRAN/{nstream-taskloop-openmp.f90 => nstream-taskloop-openmp.F90} (100%)
 rename FORTRAN/{nstream.f90 => nstream.F90} (100%)
 rename FORTRAN/{p2p-async-ornlacc.f90 => p2p-async-ornlacc.F90} (100%)
 rename FORTRAN/{p2p-coarray.f90 => p2p-coarray.F90} (100%)
 rename FORTRAN/{p2p-doacross-openmp.f90 => p2p-doacross-openmp.F90} (100%)
 rename FORTRAN/{p2p-innerloop-openmp.f90 => p2p-innerloop-openmp.F90} (100%)
 rename FORTRAN/{p2p-innerloop-ornlacc.f90 => p2p-innerloop-ornlacc.F90} (100%)
 rename FORTRAN/{p2p-innerloop.f90 => p2p-innerloop.F90} (100%)
 rename FORTRAN/{p2p-openmp-target.f90 => p2p-openmp-target.F90} (100%)
 rename FORTRAN/{p2p-ornlacc.f90 => p2p-ornlacc.F90} (100%)
 rename FORTRAN/{p2p-tasks-openmp.f90 => p2p-tasks-openmp.F90} (100%)
 rename FORTRAN/{p2p.f90 => p2p.F90} (100%)
 rename FORTRAN/{stencil-coarray.f90 => stencil-coarray.F90} (100%)
 rename FORTRAN/{stencil-openmp-target.f90 => stencil-openmp-target.F90} (99%)
 rename FORTRAN/{stencil-ornlacc.f90 => stencil-ornlacc.F90} (100%)
 rename FORTRAN/{stencil-pretty.f90 => stencil-pretty.F90} (100%)
 rename FORTRAN/{stencil-taskloop-openmp.f90 => stencil-taskloop-openmp.F90} (100%)
 rename FORTRAN/{stencil.f90 => stencil.F90} (100%)
 rename FORTRAN/{stencil_openmp.f90 => stencil_openmp.F90} (100%)
 rename FORTRAN/{stencil_pretty.f90 => stencil_pretty.F90} (100%)
 rename FORTRAN/{stencil_serial.f90 => stencil_serial.F90} (100%)
 rename FORTRAN/{stencil_target.f90 => stencil_target.F90} (100%)
 rename FORTRAN/{stencil_taskloop.f90 => stencil_taskloop.F90} (100%)
 rename FORTRAN/{transpose-coarray.f90 => transpose-coarray.F90} (100%)
 rename FORTRAN/{transpose-openmp-target.f90 => transpose-openmp-target.F90} (100%)
 rename FORTRAN/{transpose-ornlacc.f90 => transpose-ornlacc.F90} (100%)
 rename FORTRAN/{transpose-pretty.f90 => transpose-pretty.F90} (99%)
 rename FORTRAN/{transpose-taskloop-openmp.f90 => transpose-taskloop-openmp.F90} (100%)
 rename FORTRAN/{transpose-tasks-openmp.f90 => transpose-tasks-openmp.F90} (100%)
 rename FORTRAN/{transpose.f90 => transpose.F90} (100%)
 create mode 100644 common/make.defs.ibmp9nv

diff --git a/C1z/Makefile b/C1z/Makefile
index 9125fef1f..0c854088e 100644
--- a/C1z/Makefile
+++ b/C1z/Makefile
@@ -42,6 +42,10 @@ endif
 ifneq ($(CILKFLAG),)
   EXTRA += cilk
 endif
+ifeq ($(findstring xlc,$(CC)),xlc)
+  EXTRA = target
+  CFLAGS += -DXLC
+endif
 
 all: serial thread openmp taskloop $(EXTRA)
 
diff --git a/C1z/prk_util.h b/C1z/prk_util.h
index 313cca471..1cb2d4467 100644
--- a/C1z/prk_util.h
+++ b/C1z/prk_util.h
@@ -77,7 +77,7 @@ const bool false=0;
 # define OMP_BARRIER PRAGMA(omp barrier)
 # define OMP_FOR(x) PRAGMA(omp for x)
 # define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) )
-# if (_OPENMP >= 201300)
+# if (_OPENMP >= 201300) || (__ibmxl_version__ >= 16)
 #  define OMP_SIMD PRAGMA(omp simd)
 #  define OMP_FOR_SIMD(x) PRAGMA(omp for simd x)
 #  define OMP_TASK(x) PRAGMA(omp task x)
diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 84665feaf..596c87793 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -193,9 +193,14 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 %-raja: %-raja.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@
 
+ifeq ($(PRK_KOKKOS_BACKEND),Cuda)
+%-kokkos: %-kokkos.cc prk_util.h
+	${KOKKOSDIR}/bin/nvcc_wrapper $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@
+else
 %-kokkos: %-kokkos.cc prk_util.h
 	$(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP)
 	$(CXX) $(CXXFLAGS) $< $(KOKKOSFLAGS) -o $@
+endif
 
 # for host execution
 %-thrust: %-thrust.cc prk_util.h
diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu
index 13cd1a4e5..8ecbee9bf 100644
--- a/Cxx11/nstream-device-thrust.cu
+++ b/Cxx11/nstream-device-thrust.cu
@@ -64,6 +64,7 @@
 
 #include "prk_util.h"
 #include "prk_cuda.h"
+#include "prk_thrust.h"
 
 int main(int argc, char * argv[])
 {
@@ -115,8 +116,6 @@ int main(int argc, char * argv[])
   thrust::device_vector<double> B(length);
   thrust::device_vector<double> C(length);
 
-  auto range = prk::range(static_cast<size_t>(0), length);
-
   double scalar(3);
   {
     thrust::fill(thrust::device, A.begin(), A.end(), 0.0);
diff --git a/Cxx11/nstream-openmp-target.cc b/Cxx11/nstream-openmp-target.cc
index d4a437a08..8715962a8 100644
--- a/Cxx11/nstream-openmp-target.cc
+++ b/Cxx11/nstream-openmp-target.cc
@@ -129,9 +129,9 @@ int main(int argc, char * argv[])
   }
 
   // DEVICE
-  OMP_TARGET( data map(tofrom: A[0:length], B[0:length], C[0:length]) )
+  OMP_TARGET( data map(tofrom: A[0:length]) map(to: B[0:length], C[0:length]) )
   {
-    for (auto iter = 0; iter<=iterations; iter++) {
+    for (int iter = 0; iter<=iterations; iter++) {
 
       if (iter==1) nstream_time = prk::wtime();
 
diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h
index 578e713e5..c562630f3 100644
--- a/Cxx11/prk_openmp.h
+++ b/Cxx11/prk_openmp.h
@@ -44,7 +44,7 @@
 # define OMP_FOR(x) PRAGMA(omp for x)
 # define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) )
 // OpenMP SIMD if supported, else not.
-# if (_OPENMP >= 201300)
+# if (_OPENMP >= 201300) || (__ibmxl_version__ >= 16)
 #  define OMP_SIMD PRAGMA(omp simd)
 #  define OMP_FOR_SIMD PRAGMA(omp for simd)
 #  define OMP_TASK(x) PRAGMA(omp task x)
diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index abdf6388d..ed798b0b1 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -40,7 +40,7 @@
 #include <cassert>
 
 // Test standard library _after_ standard headers have been included...
-#if !defined(__NVCC__) && !defined(__PGI) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI)
+#if !defined(__NVCC__) && !defined(__PGI) && !defined(__ibmxl__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI)
 # error You are using an ancient version GNU libstdc++.  Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option.
 #endif
 
@@ -278,6 +278,31 @@ namespace prk {
         return ( numerator / denominator + (numerator % denominator > 0) );
     }
 
+    template<typename T>
+    T * alloc(size_t bytes)
+    {
+        int alignment = ::prk::get_alignment();
+#if defined(__INTEL_COMPILER)
+        return (void*)_mm_malloc(bytes,alignment);
+#else
+        T * ptr = nullptr;
+        int ret = posix_memalign((void**)&ptr,alignment,bytes);
+        if (ret!=0) ptr = NULL;
+        return ptr;
+#endif
+
+    }
+
+    template<typename T>
+    void dealloc(T * p)
+    {
+#if defined(__INTEL_COMPILER)
+        _mm_free((void*)p);
+#else
+        free((void*)p);
+#endif
+    }
+
 } // namespace prk
 
 #endif /* PRK_UTIL_H */
diff --git a/Cxx11/transpose-device-thrust.cu b/Cxx11/transpose-device-thrust.cu
index 907f45e94..044032dd2 100644
--- a/Cxx11/transpose-device-thrust.cu
+++ b/Cxx11/transpose-device-thrust.cu
@@ -50,6 +50,8 @@
 //////////////////////////////////////////////////////////////////////
 
 #include "prk_util.h"
+#include "prk_cuda.h"
+#include "prk_thrust.h"
 
 struct x : public thrust::unary_function<void,int>
 {
diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index d96f87cce..e9b1fa471 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -8,10 +8,10 @@ ifndef RADIUS
   RADIUS=2
 endif
 
-STARFLAG   = -DSTAR
+STARFLAG = $(XLFPP)-DSTAR
 
 FCFLAGS  = $(DEFAULT_OPT_FLAGS)
-FCFLAGS += -DRADIUS=$(RADIUS) $(STARFLAG)
+FCFLAGS += $(XLFPP)-DRADIUS=$(RADIUS) $(STARFLAG)
 
 ifeq ($(findstring ifort,$(FC)),ifort)
   BLASFLAGS += -heap-arrays
@@ -40,6 +40,10 @@ ifeq ($(findstring flang,$(FC)),flang)
   EXTRA = target ornlacc
   FCFLAGS += -DPGI
 endif
+ifeq ($(findstring xlf,$(FC)),xlf)
+  EXTRA = target
+  FCFLAGS += $(XLFPP)-DXLF
+endif
 
 all: serial pretty openmp tasks $(EXTRA)
 
@@ -59,32 +63,32 @@ target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgem
 
 ornlacc: p2p-ornlacc p2p-innerloop-ornlacc stencil-ornlacc transpose-ornlacc nstream-ornlacc
 
-%: %.f90
+%: %.F90
 	$(FC) $(FCFLAGS) $< -o $@
 
-stencil: stencil.f90 stencil_serial.f90
-	#$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o
+stencil: stencil.F90 stencil_serial.F90
+	#$(FC) $(FCFLAGS) -c stencil_serial.F90 -o stencil_serial.o
 	$(FC) $(FCFLAGS) $< -o $@
 
-dgemm-pretty: dgemm-pretty.f90
+dgemm-pretty: dgemm-pretty.F90
 	$(FC) $(FCFLAGS) $(BLASFLAGS) $< -o $@
 
-%-pretty: %-pretty.f90
+%-pretty: %-pretty.F90
 	$(FC) $(FCFLAGS) $< -o $@
 
-%-openmp: %.f90
+%-openmp: %.F90
 	$(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@
 
-%-openmp: %-openmp.f90
+%-openmp: %-openmp.F90
 	$(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@
 
-%-coarray: %-coarray.f90
+%-coarray: %-coarray.F90
 	$(CAFC) $(FCFLAGS) $< $(COARRAYFLAG) -o $@
 
-%-target: %-target.f90
+%-target: %-target.F90
 	$(FC) $(FCFLAGS) $(OPENMPFLAG) $(OFFLOADFLAG) $< -o $@
 
-%-ornlacc: %-ornlacc.f90
+%-ornlacc: %-ornlacc.F90
 	$(FC) $(FCFLAGS) $(ORNLACCFLAG) $< -o $@
 
 clean:
diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.F90
similarity index 100%
rename from FORTRAN/dgemm-openmp-target.f90
rename to FORTRAN/dgemm-openmp-target.F90
diff --git a/FORTRAN/dgemm-pretty.f90 b/FORTRAN/dgemm-pretty.F90
similarity index 100%
rename from FORTRAN/dgemm-pretty.f90
rename to FORTRAN/dgemm-pretty.F90
diff --git a/FORTRAN/dgemm-taskloop-openmp.f90 b/FORTRAN/dgemm-taskloop-openmp.F90
similarity index 100%
rename from FORTRAN/dgemm-taskloop-openmp.f90
rename to FORTRAN/dgemm-taskloop-openmp.F90
diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.F90
similarity index 99%
rename from FORTRAN/dgemm.f90
rename to FORTRAN/dgemm.F90
index 7123882a6..55edc2977 100644
--- a/FORTRAN/dgemm.f90
+++ b/FORTRAN/dgemm.F90
@@ -279,13 +279,13 @@ program main
   forder = real(order,REAL64)
   reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1)
   checksum = 0.0d0
-  !$omp parallel do simd reduction(+:checksum)
+  !$omp parallel do reduction(+:checksum)
   do j=1,order
     do i=1,order
       checksum = checksum + C(i,j)
     enddo
   enddo
-  !$omp end parallel do simd
+  !$omp end parallel do
 
   deallocate( C )
 
diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.F90
similarity index 100%
rename from FORTRAN/nstream-openmp-target.f90
rename to FORTRAN/nstream-openmp-target.F90
diff --git a/FORTRAN/nstream-ornlacc.f90 b/FORTRAN/nstream-ornlacc.F90
similarity index 100%
rename from FORTRAN/nstream-ornlacc.f90
rename to FORTRAN/nstream-ornlacc.F90
diff --git a/FORTRAN/nstream-pretty.f90 b/FORTRAN/nstream-pretty.F90
similarity index 100%
rename from FORTRAN/nstream-pretty.f90
rename to FORTRAN/nstream-pretty.F90
diff --git a/FORTRAN/nstream-taskloop-openmp.f90 b/FORTRAN/nstream-taskloop-openmp.F90
similarity index 100%
rename from FORTRAN/nstream-taskloop-openmp.f90
rename to FORTRAN/nstream-taskloop-openmp.F90
diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.F90
similarity index 100%
rename from FORTRAN/nstream.f90
rename to FORTRAN/nstream.F90
diff --git a/FORTRAN/p2p-async-ornlacc.f90 b/FORTRAN/p2p-async-ornlacc.F90
similarity index 100%
rename from FORTRAN/p2p-async-ornlacc.f90
rename to FORTRAN/p2p-async-ornlacc.F90
diff --git a/FORTRAN/p2p-coarray.f90 b/FORTRAN/p2p-coarray.F90
similarity index 100%
rename from FORTRAN/p2p-coarray.f90
rename to FORTRAN/p2p-coarray.F90
diff --git a/FORTRAN/p2p-doacross-openmp.f90 b/FORTRAN/p2p-doacross-openmp.F90
similarity index 100%
rename from FORTRAN/p2p-doacross-openmp.f90
rename to FORTRAN/p2p-doacross-openmp.F90
diff --git a/FORTRAN/p2p-innerloop-openmp.f90 b/FORTRAN/p2p-innerloop-openmp.F90
similarity index 100%
rename from FORTRAN/p2p-innerloop-openmp.f90
rename to FORTRAN/p2p-innerloop-openmp.F90
diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.F90
similarity index 100%
rename from FORTRAN/p2p-innerloop-ornlacc.f90
rename to FORTRAN/p2p-innerloop-ornlacc.F90
diff --git a/FORTRAN/p2p-innerloop.f90 b/FORTRAN/p2p-innerloop.F90
similarity index 100%
rename from FORTRAN/p2p-innerloop.f90
rename to FORTRAN/p2p-innerloop.F90
diff --git a/FORTRAN/p2p-openmp-target.f90 b/FORTRAN/p2p-openmp-target.F90
similarity index 100%
rename from FORTRAN/p2p-openmp-target.f90
rename to FORTRAN/p2p-openmp-target.F90
diff --git a/FORTRAN/p2p-ornlacc.f90 b/FORTRAN/p2p-ornlacc.F90
similarity index 100%
rename from FORTRAN/p2p-ornlacc.f90
rename to FORTRAN/p2p-ornlacc.F90
diff --git a/FORTRAN/p2p-tasks-openmp.f90 b/FORTRAN/p2p-tasks-openmp.F90
similarity index 100%
rename from FORTRAN/p2p-tasks-openmp.f90
rename to FORTRAN/p2p-tasks-openmp.F90
diff --git a/FORTRAN/p2p.f90 b/FORTRAN/p2p.F90
similarity index 100%
rename from FORTRAN/p2p.f90
rename to FORTRAN/p2p.F90
diff --git a/FORTRAN/stencil-coarray.f90 b/FORTRAN/stencil-coarray.F90
similarity index 100%
rename from FORTRAN/stencil-coarray.f90
rename to FORTRAN/stencil-coarray.F90
diff --git a/FORTRAN/stencil-openmp-target.f90 b/FORTRAN/stencil-openmp-target.F90
similarity index 99%
rename from FORTRAN/stencil-openmp-target.f90
rename to FORTRAN/stencil-openmp-target.F90
index 7bceb70e1..f910f3245 100644
--- a/FORTRAN/stencil-openmp-target.f90
+++ b/FORTRAN/stencil-openmp-target.F90
@@ -94,7 +94,6 @@ end subroutine initialize_w
 subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B)
   use iso_fortran_env
   implicit none
-  !$omp declare target
   logical, intent(in) :: is_star, tiling
   integer(kind=INT32), intent(in) :: tile_size, r, n
   real(kind=REAL64), intent(in) :: W(-r:r,-r:r)
diff --git a/FORTRAN/stencil-ornlacc.f90 b/FORTRAN/stencil-ornlacc.F90
similarity index 100%
rename from FORTRAN/stencil-ornlacc.f90
rename to FORTRAN/stencil-ornlacc.F90
diff --git a/FORTRAN/stencil-pretty.f90 b/FORTRAN/stencil-pretty.F90
similarity index 100%
rename from FORTRAN/stencil-pretty.f90
rename to FORTRAN/stencil-pretty.F90
diff --git a/FORTRAN/stencil-taskloop-openmp.f90 b/FORTRAN/stencil-taskloop-openmp.F90
similarity index 100%
rename from FORTRAN/stencil-taskloop-openmp.f90
rename to FORTRAN/stencil-taskloop-openmp.F90
diff --git a/FORTRAN/stencil.f90 b/FORTRAN/stencil.F90
similarity index 100%
rename from FORTRAN/stencil.f90
rename to FORTRAN/stencil.F90
diff --git a/FORTRAN/stencil_openmp.f90 b/FORTRAN/stencil_openmp.F90
similarity index 100%
rename from FORTRAN/stencil_openmp.f90
rename to FORTRAN/stencil_openmp.F90
diff --git a/FORTRAN/stencil_pretty.f90 b/FORTRAN/stencil_pretty.F90
similarity index 100%
rename from FORTRAN/stencil_pretty.f90
rename to FORTRAN/stencil_pretty.F90
diff --git a/FORTRAN/stencil_serial.f90 b/FORTRAN/stencil_serial.F90
similarity index 100%
rename from FORTRAN/stencil_serial.f90
rename to FORTRAN/stencil_serial.F90
diff --git a/FORTRAN/stencil_target.f90 b/FORTRAN/stencil_target.F90
similarity index 100%
rename from FORTRAN/stencil_target.f90
rename to FORTRAN/stencil_target.F90
diff --git a/FORTRAN/stencil_taskloop.f90 b/FORTRAN/stencil_taskloop.F90
similarity index 100%
rename from FORTRAN/stencil_taskloop.f90
rename to FORTRAN/stencil_taskloop.F90
diff --git a/FORTRAN/transpose-coarray.f90 b/FORTRAN/transpose-coarray.F90
similarity index 100%
rename from FORTRAN/transpose-coarray.f90
rename to FORTRAN/transpose-coarray.F90
diff --git a/FORTRAN/transpose-openmp-target.f90 b/FORTRAN/transpose-openmp-target.F90
similarity index 100%
rename from FORTRAN/transpose-openmp-target.f90
rename to FORTRAN/transpose-openmp-target.F90
diff --git a/FORTRAN/transpose-ornlacc.f90 b/FORTRAN/transpose-ornlacc.F90
similarity index 100%
rename from FORTRAN/transpose-ornlacc.f90
rename to FORTRAN/transpose-ornlacc.F90
diff --git a/FORTRAN/transpose-pretty.f90 b/FORTRAN/transpose-pretty.F90
similarity index 99%
rename from FORTRAN/transpose-pretty.f90
rename to FORTRAN/transpose-pretty.F90
index 31c88b378..6185431a4 100644
--- a/FORTRAN/transpose-pretty.f90
+++ b/FORTRAN/transpose-pretty.F90
@@ -152,7 +152,7 @@ program main
   A = ( transpose(reshape((/ (j2, j2 = 0,o2) /),(/order, order/))) &
         * real(iterations+1,REAL64) ) &
       + real((iterations*(iterations+1))/2,REAL64)
-#if defined(PGI)
+#if defined(PGI) || defined(XLF)
   abserr = 0.0d0
   do j=1,order
     do i=1,order
diff --git a/FORTRAN/transpose-taskloop-openmp.f90 b/FORTRAN/transpose-taskloop-openmp.F90
similarity index 100%
rename from FORTRAN/transpose-taskloop-openmp.f90
rename to FORTRAN/transpose-taskloop-openmp.F90
diff --git a/FORTRAN/transpose-tasks-openmp.f90 b/FORTRAN/transpose-tasks-openmp.F90
similarity index 100%
rename from FORTRAN/transpose-tasks-openmp.f90
rename to FORTRAN/transpose-tasks-openmp.F90
diff --git a/FORTRAN/transpose.f90 b/FORTRAN/transpose.F90
similarity index 100%
rename from FORTRAN/transpose.f90
rename to FORTRAN/transpose.F90
diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv
new file mode 100644
index 000000000..3fd2badf8
--- /dev/null
+++ b/common/make.defs.ibmp9nv
@@ -0,0 +1,121 @@
+#
+# This file shows the IBM POWER9 + NVIDIA V100 toolchain options for PRKs using
+# OpenMP, MPI and/or Fortran (sans coarrays) only.
+#
+# Base compilers and language options
+#
+# C99 is required in some implementations.
+CC=xlc_r -qlanglvl=stdc99
+# All of the Fortran code is written for the 2008 standard and requires preprocessing.
+# You might need to modify the build system for the preprocessor options to work.
+FC=xlf2008_r
+XLFPP=-WF,
+# C++11 may not be required but does no harm here.
+CXX=xlc++_r -qlanglvl=extended1y
+#
+# Compiler flags
+#
+DEFAULT_OPT_FLAGS=-O3
+#
+# OpenMP flags
+#
+# You can also use -qopenmp.  -openmp is deprecated.
+OPENMPFLAG=-qsmp=omp
+OPENMPSIMDFLAG=-qsmp=omp
+OFFLOADFLAG=-qoffload -qtgtarch=sm_70
+#
+# OpenCL flags
+#
+# POCL
+# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct...
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL
+# Linux
+OPENCLDIR=/usr
+OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations
+#
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
+SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS)
+SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL
+#
+# OCCA
+#
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
+# TBB
+#
+TBBDIR=${HOME}/TBB
+TBBLIBDIR=${HOME}/TBB/build/linux_ppc64le_xl_cc4.8.5_libc2.17_kernel4.14.0_release
+TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBLIBDIR} -ltbb
+#
+# Parallel STL, Boost, etc.
+#
+BOOSTROOT=${HOME}/boost_1_71_0/include
+BOOSTFLAG=
+BOOSTFLAG+=-I${BOOSTROOT}/circular_buffer/include
+BOOSTFLAG+=-I${BOOSTROOT}/compute/include
+BOOSTFLAG+=-I${BOOSTROOT}/algorithm/include
+BOOSTFLAG+=-I${BOOSTROOT}/config/include
+BOOSTFLAG+=-I${BOOSTROOT}/core/include
+BOOSTFLAG+=-I${BOOSTROOT}/log/include
+BOOSTFLAG+=-I${BOOSTROOT}/array/include
+BOOSTFLAG+=-I${BOOSTROOT}/multi_array/include
+BOOSTFLAG+=-I${BOOSTROOT}/optional/include
+BOOSTFLAG+=-I${BOOSTROOT}/preprocessor/include
+BOOSTFLAG+=-I${BOOSTROOT}/type_index/include
+BOOSTFLAG+=-I${BOOSTROOT}/utility/include
+BOOSTFLAG+=-I${BOOSTROOT}/assert/include
+BOOSTFLAG+=-I${BOOSTROOT}/static_assert/include
+BOOSTFLAG+=-I${BOOSTROOT}/exception/include
+BOOSTFLAG+=-I${BOOSTROOT}/throw_exception/include
+BOOSTFLAG+=-I${BOOSTROOT}/concept_check/include
+BOOSTFLAG+=-I${BOOSTROOT}/type_traits/include
+BOOSTFLAG+=-I${BOOSTROOT}/iterator/include
+BOOSTFLAG+=-I${BOOSTROOT}/mpl/include
+BOOSTFLAG+=-I${BOOSTROOT}/detail/include
+BOOSTFLAG+=-I${BOOSTROOT}/functional/include
+BOOSTFLAG+=-I${BOOSTROOT}/move/include
+BOOSTFLAG+=-I${BOOSTROOT}/range/include
+BOOSTFLAG+=-I${BOOSTROOT}/function/include
+BOOSTFLAG+=-I${BOOSTROOT}/integer/include
+BOOSTFLAG+=-I${BOOSTROOT}/container_hash/include
+BOOSTFLAG+=-I${BOOSTROOT}/bind/include
+BOOSTFLAG+=-I${BOOSTROOT}/chrono/include
+BOOSTFLAG+=-I${BOOSTROOT}/predef/include
+BOOSTFLAG+=-I${BOOSTROOT}/ratio/include
+BOOSTFLAG+=-I${BOOSTROOT}/function_types/include
+BOOSTFLAG+=-I${BOOSTROOT}/tuple/include
+BOOSTFLAG+=-I${BOOSTROOT}/lexical_cast/include
+BOOSTFLAG+=-I${BOOSTROOT}/numeric/conversion/include
+BOOSTFLAG+=-I${BOOSTROOT}/container/include
+BOOSTFLAG+=-I${BOOSTROOT}/math/include
+BOOSTFLAG+=-I${BOOSTROOT}/fusion/include
+BOOSTFLAG+=-I${BOOSTROOT}/typeof/include
+BOOSTFLAG+=-I${BOOSTROOT}/uuid/include
+BOOSTFLAG+=-I${BOOSTROOT}/smart_ptr/include
+BOOSTFLAG+=-I${BOOSTROOT}/proto/include
+BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages
+KOKKOSDIR=${HOME}/KOKKOS/install-cuda
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl
+RAJADIR=
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+THRUSTDIR=/usr/local/cuda-10.1/targets/ppc64le-linux/include
+THRUSTFLAG=-I${THRUSTDIR}
+#
+# CBLAS for C++ DGEMM
+#
+BLASFLAG=-DESSL
+CBLASFLAG=-DESSL
+#
+# CUDA flags
+#
+# Linux w/ NVIDIA CUDA
+NVCC=/usr/local/cuda-10.1/bin/nvcc -arch=sm_70
+CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda

From c6aca6b603d3da5d35ed716234977067031fef3a Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Thu, 10 Oct 2019 11:24:18 -0400
Subject: [PATCH 228/245] C++ nstream error check precision fix (#423)

---
 Cxx11/nstream-cublas.cu        | 1 +
 Cxx11/nstream-cuda.cu          | 1 +
 Cxx11/nstream-device-thrust.cu | 1 +
 Cxx11/nstream-host-thrust.cc   | 1 +
 Cxx11/nstream-kokkos.cc        | 1 +
 Cxx11/nstream-occa.cc          | 1 +
 Cxx11/nstream-opencl.cc        | 3 ++-
 Cxx11/nstream-openmp-target.cc | 1 +
 Cxx11/nstream-openmp.cc        | 1 +
 Cxx11/nstream-raja.cc          | 1 +
 Cxx11/nstream-sycl-explicit.cc | 3 ++-
 Cxx11/nstream-sycl-usm.cc      | 3 ++-
 Cxx11/nstream-sycl.cc          | 3 ++-
 Cxx11/nstream.cc               | 1 +
 14 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/Cxx11/nstream-cublas.cu b/Cxx11/nstream-cublas.cu
index 65989a3af..ffd8fa0b0 100644
--- a/Cxx11/nstream-cublas.cu
+++ b/Cxx11/nstream-cublas.cu
@@ -199,6 +199,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-cuda.cu b/Cxx11/nstream-cuda.cu
index 4597021bb..594d75369 100644
--- a/Cxx11/nstream-cuda.cu
+++ b/Cxx11/nstream-cuda.cu
@@ -207,6 +207,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu
index 8ecbee9bf..7f2ea6168 100644
--- a/Cxx11/nstream-device-thrust.cu
+++ b/Cxx11/nstream-device-thrust.cu
@@ -162,6 +162,7 @@ int main(int argc, char * argv[])
   double epsilon(1.e-8);
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc
index ac82f33d3..7b5123c06 100644
--- a/Cxx11/nstream-host-thrust.cc
+++ b/Cxx11/nstream-host-thrust.cc
@@ -160,6 +160,7 @@ int main(int argc, char * argv[])
   double epsilon(1.e-8);
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc
index be425e75b..0d09d4079 100644
--- a/Cxx11/nstream-kokkos.cc
+++ b/Cxx11/nstream-kokkos.cc
@@ -177,6 +177,7 @@ int main(int argc, char * argv[])
     double epsilon(1.e-8);
     if (std::fabs(ar-asum)/asum > epsilon) {
         std::cout << "Failed Validation on output array\n"
+                  << std::setprecision(16)
                   << "       Expected checksum: " << ar << "\n"
                   << "       Observed checksum: " << asum << std::endl;
         std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc
index 6d584e893..ee2e15e61 100644
--- a/Cxx11/nstream-occa.cc
+++ b/Cxx11/nstream-occa.cc
@@ -188,6 +188,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc
index 4ef40bd64..103980dc6 100644
--- a/Cxx11/nstream-opencl.cc
+++ b/Cxx11/nstream-opencl.cc
@@ -117,7 +117,7 @@ void run(cl::Context context, int iterations, size_t length)
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  T ar(0);
+  double ar(0);
   T br(2);
   T cr(2);
   for (auto i=0; i<=iterations; i++) {
@@ -134,6 +134,7 @@ void run(cl::Context context, int iterations, size_t length)
   const double epsilon = (precision==64) ? 1.0e-8 : 1.0e-4;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-openmp-target.cc b/Cxx11/nstream-openmp-target.cc
index 8715962a8..6eb800e3e 100644
--- a/Cxx11/nstream-openmp-target.cc
+++ b/Cxx11/nstream-openmp-target.cc
@@ -165,6 +165,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-openmp.cc b/Cxx11/nstream-openmp.cc
index f3ea9bbd8..1eb24321a 100644
--- a/Cxx11/nstream-openmp.cc
+++ b/Cxx11/nstream-openmp.cc
@@ -172,6 +172,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc
index dcba4cbf2..f86ebaf33 100644
--- a/Cxx11/nstream-raja.cc
+++ b/Cxx11/nstream-raja.cc
@@ -175,6 +175,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc
index ef2a0392b..aee3d8167 100644
--- a/Cxx11/nstream-sycl-explicit.cc
+++ b/Cxx11/nstream-sycl-explicit.cc
@@ -159,7 +159,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  T ar(0);
+  double ar(0);
   T br(2);
   T cr(2);
   for (int i=0; i<=iterations; ++i) {
@@ -176,6 +176,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   const double epsilon(1.e-8);
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc
index c92a52bc9..d56df67bf 100644
--- a/Cxx11/nstream-sycl-usm.cc
+++ b/Cxx11/nstream-sycl-usm.cc
@@ -147,7 +147,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  T ar(0);
+  double ar(0);
   T br(2);
   T cr(2);
   for (int i=0; i<=iterations; ++i) {
@@ -164,6 +164,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   const double epsilon(1.e-8);
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index bc52e6649..f9a891407 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -140,7 +140,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   /// Analyze and output results
   //////////////////////////////////////////////////////////////////////
 
-  T ar(0);
+  double ar(0);
   T br(2);
   T cr(2);
   for (int i=0; i<=iterations; ++i) {
@@ -157,6 +157,7 @@ void run(sycl::queue & q, int iterations, size_t length)
   const double epsilon(1.e-8);
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;
diff --git a/Cxx11/nstream.cc b/Cxx11/nstream.cc
index 5673d3cf6..d97765c48 100644
--- a/Cxx11/nstream.cc
+++ b/Cxx11/nstream.cc
@@ -148,6 +148,7 @@ int main(int argc, char * argv[])
   double epsilon=1.e-8;
   if (std::fabs(ar-asum)/asum > epsilon) {
       std::cout << "Failed Validation on output array\n"
+                << std::setprecision(16)
                 << "       Expected checksum: " << ar << "\n"
                 << "       Observed checksum: " << asum << std::endl;
       std::cout << "ERROR: solution did not validate" << std::endl;

From 12f34a29007cc353ce7420cdcadd1f78a6007dc1 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 18:57:26 -0700
Subject: [PATCH 229/245] fix Kokkos CUDA issues on POWER9+V100 (#425)

* add Kokkos README
* update CUDA and Kokkos related things
* add missing CPPFLAGS
---
 Cxx11/Makefile           |  2 +-
 common/KOKKOS.md         | 17 +++++++++++++++++
 common/make.defs.ibmp9nv |  6 +++---
 3 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 common/KOKKOS.md

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 596c87793..e97c3fd2c 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -195,7 +195,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 
 ifeq ($(PRK_KOKKOS_BACKEND),Cuda)
 %-kokkos: %-kokkos.cc prk_util.h
-	${KOKKOSDIR}/bin/nvcc_wrapper $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@
+	${KOKKOSDIR}/bin/nvcc_wrapper $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@
 else
 %-kokkos: %-kokkos.cc prk_util.h
 	$(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP)
diff --git a/common/KOKKOS.md b/common/KOKKOS.md
new file mode 100644
index 000000000..4a069d0b4
--- /dev/null
+++ b/common/KOKKOS.md
@@ -0,0 +1,17 @@
+# Kokkos README
+
+## IBM POWER9 + NVIDIA V100
+
+If you do not enable GPU arch >5, it fails at runtime.
+
+If you do not enable lambda support, `parallel_reduce` will not compile.
+
+```
+cmake .. -DKokkos_ENABLE_CUDA=True \
+         -DCMAKE_CXX_COMPILER=$HOME/KOKKOS/git/bin/nvcc_wrapper \
+         -DCMAKE_INSTALL_PREFIX=$HOME/KOKKOS/install-cuda \
+         -DKokkos_ARCH_POWER9=ON \
+         -DKokkos_ARCH_VOLTA70=ON \
+         -DKokkos_ENABLE_CUDA_LAMBDA=ON \
+ && make -j install
+```
diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv
index 3fd2badf8..c222c0ce3 100644
--- a/common/make.defs.ibmp9nv
+++ b/common/make.defs.ibmp9nv
@@ -103,7 +103,7 @@ RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages
 KOKKOSDIR=${HOME}/KOKKOS/install-cuda
-KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib64 -lkokkoscore
 RAJADIR=
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/usr/local/cuda-10.1/targets/ppc64le-linux/include
@@ -117,5 +117,5 @@ CBLASFLAG=-DESSL
 # CUDA flags
 #
 # Linux w/ NVIDIA CUDA
-NVCC=/usr/local/cuda-10.1/bin/nvcc -arch=sm_70
-CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda
+NVCC=/usr/local/cuda-10.1/bin/nvcc
+CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda -arch=sm_70

From e1c5652520fb4077b5fbeb8cbb127c12d4930ef2 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 22:14:07 -0600
Subject: [PATCH 230/245] more NVCC fun

---
 common/make.defs.ibmp9nv | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv
index c222c0ce3..0d59c4c3c 100644
--- a/common/make.defs.ibmp9nv
+++ b/common/make.defs.ibmp9nv
@@ -118,4 +118,7 @@ CBLASFLAG=-DESSL
 #
 # Linux w/ NVIDIA CUDA
 NVCC=/usr/local/cuda-10.1/bin/nvcc
-CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda -arch=sm_70
+CUDAFLAGS=-g -O3 -std=c++11
+CUDAFLAGS+=--expt-extended-lambda
+CUDAFLAGS+=-arch=sm_70
+CUDAFLAGS+=-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored

From 78d47c0be844b3a347e879b9365e6058ca6753c2 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 22:51:48 -0600
Subject: [PATCH 231/245] xlc suppress warning in OpenCL code

    1500-029: (W) WARNING: subprogram cl::Platform::getDevices(cl_device_type, std::vector<Device> *) could not be inlined into cl::Context::Context(cl_device_type, cl_context_properties *, void (*)(const char *, const void *, ::size_t, void *), void *, cl_int *).
---
 common/make.defs.ibmp9nv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv
index 0d59c4c3c..a9684645f 100644
--- a/common/make.defs.ibmp9nv
+++ b/common/make.defs.ibmp9nv
@@ -31,7 +31,7 @@ OFFLOADFLAG=-qoffload -qtgtarch=sm_70
 #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL
 # Linux
 OPENCLDIR=/usr
-OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations
+OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029
 #
 # SYCL flags
 #

From 15a4cd6763cc683bb33cd3dae2083874706a1a93 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 23:14:51 -0600
Subject: [PATCH 232/245] more IBM fixes, Boost stuff

---
 common/make.defs.ibmp9nv | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv
index a9684645f..bd6d25d8c 100644
--- a/common/make.defs.ibmp9nv
+++ b/common/make.defs.ibmp9nv
@@ -15,7 +15,7 @@ CXX=xlc++_r -qlanglvl=extended1y
 #
 # Compiler flags
 #
-DEFAULT_OPT_FLAGS=-O3
+DEFAULT_OPT_FLAGS=-O3 -qsuppress=1500-036
 #
 # OpenMP flags
 #
@@ -33,15 +33,6 @@ OFFLOADFLAG=-qoffload -qtgtarch=sm_70
 OPENCLDIR=/usr
 OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029
 #
-# SYCL flags
-#
-# triSYCL
-# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
-SYCLDIR=./triSYCL
-#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
-SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS)
-SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL
-#
 # OCCA
 #
 #OCCADIR=${HOME}/prk-repo/Cxx11/occa
@@ -54,7 +45,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBLIBDIR} -ltbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTROOT=${HOME}/boost_1_71_0/include
+BOOSTROOT=${HOME}/boost/libs
 BOOSTFLAG=
 BOOSTFLAG+=-I${BOOSTROOT}/circular_buffer/include
 BOOSTFLAG+=-I${BOOSTROOT}/compute/include
@@ -109,6 +100,17 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/usr/local/cuda-10.1/targets/ppc64le-linux/include
 THRUSTFLAG=-I${THRUSTDIR}
 #
+# SYCL flags
+#
+# triSYCL
+# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory...
+SYCLDIR=./triSYCL
+#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
+#SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS)
+SYCLCXX=g++ -O3 -std=c++17
+SYCLFLAG=-I$(SYCLDIR)/include -DTRISYCL
+SYCLFLAG+=$(BOOSTFLAG)
+#
 # CBLAS for C++ DGEMM
 #
 BLASFLAG=-DESSL

From 70d04eb2706e7b1b5e418d1ec44abd01ae38ea1c Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 23:25:27 -0600
Subject: [PATCH 233/245] change stdlib check from error to warning

---
 Cxx11/prk_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h
index ed798b0b1..a6a9af3c2 100644
--- a/Cxx11/prk_util.h
+++ b/Cxx11/prk_util.h
@@ -41,7 +41,7 @@
 
 // Test standard library _after_ standard headers have been included...
 #if !defined(__NVCC__) && !defined(__PGI) && !defined(__ibmxl__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI)
-# error You are using an ancient version GNU libstdc++.  Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option.
+# warning You are using an ancient version GNU libstdc++.  Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option.
 #endif
 
 #if !(defined(__cplusplus) && (__cplusplus >= 201103L))

From f4a174013ff70648e39268d404bfe69beaae4adb Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 23:26:01 -0600
Subject: [PATCH 234/245] disable boost-compute by default; use CXX to build
 OpenCL

---
 Cxx11/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index e97c3fd2c..a40207f76 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -69,7 +69,7 @@ ifneq ($(findstring pgc++,$(CXX)),pgc++)
   EXTRA += tbb pstl
 endif
 
-all: sequential vector valarray openmp taskloop stl rangefor opencl sycl boost-compute $(EXTRA)
+all: sequential vector valarray openmp taskloop stl rangefor opencl sycl $(EXTRA)
 
 #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \
      p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \
@@ -154,7 +154,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h
 	$(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
 
 %-opencl: %-opencl.cc prk_util.h prk_opencl.h
-	$(SYCLCXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@
 
 %-sycl: %-sycl.cc prk_util.h
 	$(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@

From d897d6a5f15bcc8e635565da02331a8d633acedb Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 19 Oct 2019 23:27:43 -0600
Subject: [PATCH 235/245] change flags for OpenCL and SYCL

---
 common/make.defs.ibmp9nv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv
index bd6d25d8c..e4ee52866 100644
--- a/common/make.defs.ibmp9nv
+++ b/common/make.defs.ibmp9nv
@@ -31,7 +31,7 @@ OFFLOADFLAG=-qoffload -qtgtarch=sm_70
 #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL
 # Linux
 OPENCLDIR=/usr
-OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029
+OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029 -qstrict
 #
 # OCCA
 #
@@ -107,7 +107,7 @@ THRUSTFLAG=-I${THRUSTDIR}
 SYCLDIR=./triSYCL
 #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS)
 #SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS)
-SYCLCXX=g++ -O3 -std=c++17
+SYCLCXX=g++ -O3 -std=gnu++11
 SYCLFLAG=-I$(SYCLDIR)/include -DTRISYCL
 SYCLFLAG+=$(BOOSTFLAG)
 #

From 11dadb9f47cf80a2cd5a9ee2bdfe2df25995d83d Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 20 Oct 2019 10:52:48 -0700
Subject: [PATCH 236/245] RAJA docs (WIP)

---
 common/RAJA.md | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 common/RAJA.md

diff --git a/common/RAJA.md b/common/RAJA.md
new file mode 100644
index 000000000..3b5ce0e81
--- /dev/null
+++ b/common/RAJA.md
@@ -0,0 +1,11 @@
+# RAJA README
+
+## IBM POWER9 + NVIDIA V100
+
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/RAJA/install-cuda \
+         -DCMAKE_CXX_COMPILER=xlc++_r -DCMAKE_C_COMPILER=xlc_r \
+         -DENABLE_OPENMP=On -DENABLE_TARGET_OPENMP=On -DOpenMP_CXX_FLAGS="-qsmp -qoffload" \
+         -DENABLE_CUDA=On -DCUDA_ARCH=sm_70
+ && make -j install
+```

From ac48fbcede5b216d5d29b4dfcb175652ded1aacc Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 20 Oct 2019 10:56:42 -0700
Subject: [PATCH 237/245] Update RAJA.md

---
 common/RAJA.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/common/RAJA.md b/common/RAJA.md
index 3b5ce0e81..abba1b578 100644
--- a/common/RAJA.md
+++ b/common/RAJA.md
@@ -9,3 +9,6 @@ cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/RAJA/install-cuda \
          -DENABLE_CUDA=On -DCUDA_ARCH=sm_70
  && make -j install
 ```
+
+Optional extras: `-qsuppress=1500-030` or `-qmaxmem=-1`
+

From e584b4a23d590709aca441bb0e97c1ffe258529e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 17 Dec 2019 09:13:14 -0800
Subject: [PATCH 238/245] multi-GPU CUBLAS DGEMM (#430)

* add single-threaded multi-GPU CUBLAS
* added MPI helper API in prk_mpi.h
* add build system support for MPI+CUBLAS
* add option to use specific number of GPUs

CUBLAS multi-GPU support is weird. CUBLAS handles do not capture GPU device id.

Tested and working on a 2 V100 x86 system.
---
 Cxx11/Makefile                 |   5 +-
 Cxx11/dgemm-mpi-cublas.cu      | 272 ++++++++++++++++++++++++++
 Cxx11/dgemm-multigpu-cublas.cu | 335 +++++++++++++++++++++++++++++++++
 Cxx11/prk_cuda.h               |  17 ++
 Cxx11/prk_mpi.h                | 115 +++++++++++
 common/make.defs.cuda          |   1 +
 6 files changed, 744 insertions(+), 1 deletion(-)
 create mode 100644 Cxx11/dgemm-mpi-cublas.cu
 create mode 100644 Cxx11/dgemm-multigpu-cublas.cu
 create mode 100644 Cxx11/prk_mpi.h

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index a40207f76..bc2c8cd32 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -132,7 +132,7 @@ thrust: nstream-host-thrust nstream-device-thrust \
 
 cuda: transpose-cuda
 
-cublas: transpose-cublas nstream-cublas dgemm-cublas
+cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas
 
 cblas: transpose-cblas dgemm-cblas
 
@@ -213,6 +213,9 @@ endif
 %-cuda: %-cuda.cu prk_util.h prk_cuda.h
 	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@
 
+%-mpi-cublas: %-mpi-cublas.cu prk_util.h prk_cuda.h prk_mpi.h
+	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -lcublas $(MPIFLAGS) -o $@
+
 %-cublas: %-cublas.cu prk_util.h prk_cuda.h
 	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -lcublas -o $@
 
diff --git a/Cxx11/dgemm-mpi-cublas.cu b/Cxx11/dgemm-mpi-cublas.cu
new file mode 100644
index 000000000..c78da594a
--- /dev/null
+++ b/Cxx11/dgemm-mpi-cublas.cu
@@ -0,0 +1,272 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix
+///          dense multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a tile size for matrix
+///          blocking
+///
+///          <progname> <# iterations> <matrix order>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          cblasDgemm()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+#include "prk_mpi.h"
+
+__global__ void init(int order, double * A, double * B, double * C)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if ((i<order) && (j<order)) {
+      A[i*order+j] = i;
+      B[i*order+j] = i;
+      C[i*order+j] = 0;
+    }
+}
+
+__global__ void init(int order, double * C)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if ((i<order) && (j<order)) {
+      C[i*order+j] = 0;
+    }
+}
+
+int main(int argc, char * argv[])
+{
+  {
+    prk::MPI::state mpi(argc,argv);
+
+    int np = prk::MPI::size();
+    int me = prk::MPI::rank();
+
+    prk::CUDA::info cuda;
+
+    if (me == 0) {
+      std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+      std::cout << "MPI/C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
+      cuda.print();
+    }
+
+    int ngpu = cuda.num_gpus();
+
+    if (ngpu != np) {
+        std::cout << "Please run with one MPI process per GPU (single-node only)" << std::endl;
+        return (np-ngpu);
+    }
+
+    // assign a GPU per MPI process
+    cuda.set_gpu(me);
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations;
+    int order;
+    try {
+        if (argc < 2) {
+          throw "Usage: <# iterations> <matrix order>";
+        }
+
+        iterations  = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        order = std::atoi(argv[2]);
+        if (order <= 0) {
+          throw "ERROR: Matrix Order must be greater than 0";
+        } else if (order > std::floor(std::sqrt(INT_MAX))) {
+          throw "ERROR: matrix dimension too large - overflow risk";
+        }
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
+
+    if (me == 0) {
+      std::cout << "Number of iterations = " << iterations << std::endl;
+      std::cout << "Matrix order         = " << order << std::endl;
+    }
+
+    cublasHandle_t h;
+    prk::CUDA::check( cublasCreate(&h) );
+
+    const int tile_size = 32;
+    dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
+    dim3 dimBlock(tile_size, tile_size, 1);
+
+    cuda.checkDims(dimBlock, dimGrid);
+
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space for matrices
+    //////////////////////////////////////////////////////////////////////
+
+    double dgemm_time(0);
+
+    const size_t nelems = (size_t)order * (size_t)order;
+    const size_t bytes = nelems * sizeof(double);
+
+    // host buffers
+    double * h_c;
+    prk::CUDA::check( cudaMallocHost((void**)&h_c, bytes) );
+
+    // device buffers
+    double * d_a;
+    double * d_b;
+    double * d_c;
+    prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) );
+    prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) );
+    prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) );
+
+    init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+
+    {
+      for (auto iter = 0; iter<=iterations; iter++) {
+
+        if (iter==1) {
+            prk::MPI::barrier();
+            dgemm_time = prk::wtime();
+        }
+
+        double alpha = 1.0;
+        double beta  = 1.0;
+        prk::CUDA::check( cublasDgemm(h,
+                                      CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
+                                      order, order, order,      // m, n, k
+                                      &alpha,                   // alpha
+                                      d_a, order,               // A, lda
+                                      d_b, order,               // B, ldb
+                                      &beta,                    // beta
+                                      d_c, order) );            // C, ldc
+
+        prk::CUDA::check( cudaDeviceSynchronize() );
+      }
+      prk::MPI::barrier();
+      dgemm_time = prk::wtime() - dgemm_time;
+    }
+
+    // copy output back to host
+    prk::CUDA::check( cudaMemcpyAsync(&(h_c[0]), d_c, bytes, cudaMemcpyDeviceToHost) );
+
+    prk::CUDA::check( cudaFree(d_c) );
+    prk::CUDA::check( cudaFree(d_b) );
+    prk::CUDA::check( cudaFree(d_a) );
+
+    prk::CUDA::check( cublasDestroy(h) );
+
+    prk::CUDA::check( cudaDeviceSynchronize() );
+
+    //////////////////////////////////////////////////////////////////////
+    /// Analyze and output results
+    //////////////////////////////////////////////////////////////////////
+
+    const double epsilon = 1.0e-8;
+    const double forder = static_cast<double>(order);
+    const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
+    double residuum(0);
+    const auto checksum = prk::reduce( &(h_c[0]), &(h_c[nelems]), 0.0);
+    residuum += std::abs(checksum-reference)/reference;
+
+    // take the global max to make sure everyone passes...
+    residuum = prk::MPI::max(residuum);
+
+#ifndef VERBOSE
+    if (residuum >= epsilon)
+#endif
+    {
+      for (int r=0; r<np; ++r) {
+        prk::MPI::barrier();
+        if (r==me) {
+          std::cout << "Reference checksum = " << reference << "\n"
+                    << "Actual checksum = " << residuum << std::endl;
+        }
+      }
+    }
+
+    if (residuum < epsilon) {
+      prk::MPI::barrier();
+      if (me==0) {
+        std::cout << "Solution validates" << std::endl;
+      }
+      auto time = dgemm_time/iterations;
+      auto nflops = 2.0 * std::pow(forder,3);
+      auto rate = 1.0e-6 * nflops/time;
+
+      double minrate = prk::MPI::min(rate);
+      double maxrate = prk::MPI::max(rate);
+      double avgrate = prk::MPI::avg(rate);
+
+      double mintime = prk::MPI::min(time);
+      double maxtime = prk::MPI::max(time);
+      double avgtime = prk::MPI::avg(time);
+
+      if (me==0) {
+        std::cout << "MIN Rate (MF/s): " << minrate << " Avg time (s): " << maxtime << std::endl;
+        std::cout << "MAX Rate (MF/s): " << maxrate << " Avg time (s): " << mintime << std::endl;
+        std::cout << "AVG Rate (MF/s): " << avgrate << " Avg time (s): " << avgtime << std::endl;
+      }
+    }
+
+    prk::CUDA::check( cudaFreeHost(h_c) );
+
+  } // prk::MPI:state goes out of scope here
+
+  return 0;
+}
+
+
diff --git a/Cxx11/dgemm-multigpu-cublas.cu b/Cxx11/dgemm-multigpu-cublas.cu
new file mode 100644
index 000000000..b4e1bff49
--- /dev/null
+++ b/Cxx11/dgemm-multigpu-cublas.cu
@@ -0,0 +1,335 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix
+///          dense multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a tile size for matrix
+///          blocking
+///
+///          <progname> <# iterations> <matrix order> [<batches>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          cblasDgemm()
+///          cublasDgemmStridedBatched()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+
+__global__ void init(int order, const int matrices, double * A, double * B, double * C)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    for (int b=0; b<matrices; ++b) {
+      if ((i<order) && (j<order)) {
+        A[b*order*order+i*order+j] = i;
+        B[b*order*order+i*order+j] = i;
+        C[b*order*order+i*order+j] = 0;
+      }
+    }
+}
+
+__global__ void init(int order, const int matrices, double * C)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    for (int b=0; b<matrices; ++b) {
+      if ((i<order) && (j<order)) {
+        C[b*order*order+i*order+j] = 0;
+      }
+    }
+}
+
+void prk_dgemm(const cublasHandle_t & h,
+               const int order,
+               const int batches,
+               double * A,
+               double * B,
+               double * C)
+{
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+    for (int b=0; b<batches; ++b) {
+        double * pA = &(A[b*order*order]);
+        double * pB = &(B[b*order*order]);
+        double * pC = &(C[b*order*order]);
+        prk::CUDA::check( cublasDgemm(h,
+                                      CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
+                                      order, order, order,      // m, n, k
+                                      &alpha,                   // alpha
+                                      pA, order,                // A, lda
+                                      pB, order,                // B, ldb
+                                      &beta,                    // beta
+                                      pC, order) );             // C, ldc
+    }
+}
+
+void prk_bgemm(const cublasHandle_t & h,
+               const int order,
+               const int batches,
+               double * A,
+               double * B,
+               double * C)
+{
+    const double alpha = 1.0;
+    const double beta  = 1.0;
+
+    prk::CUDA::check( cublasDgemmStridedBatched(h,
+                                                CUBLAS_OP_N, CUBLAS_OP_N,
+                                                order, order, order,
+                                                &alpha,
+                                                (const double *)A, order, order*order,
+                                                (const double *)B, order, order*order,
+                                                &beta,
+                                                C, order, order*order,
+                                                batches) );
+
+    //  cublasStatus_t cublasDgemmBatched(cublasHandle_t handle,
+    //                                    cublasOperation_t transa,
+    //                                    cublasOperation_t transb,
+    //                                    int m, int n, int k,
+    //                                    const double          *alpha,
+    //                                    const double          *Aarray[], int lda,
+    //                                    const double          *Barray[], int ldb,
+    //                                    const double          *beta,
+    //                                    double          *Carray[], int ldc,
+    //                                    int batchCount)
+}
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int batches = 0;
+  int use_ngpu = 1;
+  try {
+      if (argc < 2) {
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<use_ngpu>]";
+      }
+
+      iterations  = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > std::floor(std::sqrt(INT_MAX))) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      if (argc>3) {
+        batches = std::atoi(argv[3]);
+      }
+
+      if (argc>4) {
+        use_ngpu = std::atoi(argv[4]);
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations  = " << iterations << std::endl;
+  std::cout << "Matrix order          = " << order << std::endl;
+  if (batches == 0) {
+      std::cout << "No batching" << std::endl;
+  } else if (batches < 0) {
+      std::cout << "Batch size            = " << -batches << " (loop over legacy BLAS)" << std::endl;
+  } else if (batches > 0) {
+      std::cout << "Batch size            = " <<  batches << " (batched BLAS)" << std::endl;
+  }
+  std::cout << "Number of GPUs to use = " << use_ngpu << std::endl;
+
+  int haz_ngpu = info.num_gpus();
+  std::cout << "Number of GPUs found  = " << haz_ngpu << std::endl;
+
+  if (use_ngpu > haz_ngpu) {
+      std::cout << "You cannot use more GPUs (" << use_ngpu << ") than you have (" << haz_ngpu << ")" << std::endl;
+  }
+
+  int ngpus = use_ngpu;
+
+  std::vector<cublasHandle_t> contexts(ngpus);
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaSetDevice(i) );
+      prk::CUDA::check( cublasCreate(&contexts[i]) );
+  }
+
+  const int tile_size = 32;
+  dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
+  dim3 dimBlock(tile_size, tile_size, 1);
+
+  info.checkDims(dimBlock, dimGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  double dgemm_time(0);
+
+  const int matrices = (batches==0 ? 1 : abs(batches));
+  const size_t nelems = (size_t)order * (size_t)order;
+  const size_t bytes = nelems * sizeof(double);
+
+  // host buffers
+  std::vector<double*> h_c(ngpus,nullptr);
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaMallocHost((void**)&h_c[i], matrices*bytes) );
+  }
+
+  // device buffers
+  std::vector<double*> d_a(ngpus,nullptr);
+  std::vector<double*> d_b(ngpus,nullptr);
+  std::vector<double*> d_c(ngpus,nullptr);
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaSetDevice(i) );
+      prk::CUDA::check( cudaMalloc((void**)&d_a[i], matrices*bytes) );
+      prk::CUDA::check( cudaMalloc((void**)&d_b[i], matrices*bytes) );
+      prk::CUDA::check( cudaMalloc((void**)&d_c[i], matrices*bytes) );
+      init<<<dimGrid, dimBlock>>>(order, matrices, d_a[i], d_b[i], d_c[i]);
+  }
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaSetDevice(i) );
+      prk::CUDA::check( cudaDeviceSynchronize() );
+  }
+
+  for (int iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) dgemm_time = prk::wtime();
+
+    for (int i=0; i<ngpus; ++i) {
+        prk::CUDA::check( cudaSetDevice(i) );
+        if (batches == 0) {
+            prk_dgemm(contexts[i], order, matrices, d_a[i], d_b[i], d_c[i]);
+        } else if (batches < 0) {
+            prk_dgemm(contexts[i], order, matrices, d_a[i], d_b[i], d_c[i]);
+        } else if (batches > 0) {
+            prk_bgemm(contexts[i], order, matrices, d_a[i], d_b[i], d_c[i]);
+        }
+    }
+    for (int i=0; i<ngpus; ++i) {
+        prk::CUDA::check( cudaSetDevice(i) );
+        prk::CUDA::check( cudaDeviceSynchronize() );
+    }
+  }
+  dgemm_time = prk::wtime() - dgemm_time;
+
+  // copy output back to host
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaSetDevice(i) );
+      prk::CUDA::check( cudaMemcpyAsync(h_c[i], d_c[i], matrices*bytes, cudaMemcpyDeviceToHost) );
+  }
+
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaSetDevice(i) );
+      prk::CUDA::check( cudaDeviceSynchronize() );
+      prk::CUDA::check( cudaFree(d_c[i]) );
+      prk::CUDA::check( cudaFree(d_b[i]) );
+      prk::CUDA::check( cudaFree(d_a[i]) );
+      prk::CUDA::check( cublasDestroy(contexts[i]) );
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const double epsilon = 1.0e-8;
+  const double forder = static_cast<double>(order);
+  const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1);
+
+  double residuum(0);
+  for (int i=0; i<ngpus; ++i) {
+      for (int b=0; b<matrices; ++b) {
+          const auto checksum = prk::reduce( &(h_c[i][b*order*order+0]), &(h_c[i][b*order*order+nelems]), 0.0);
+          residuum += std::abs(checksum-reference)/reference;
+      }
+  }
+  residuum/=matrices;
+  residuum/=ngpus;
+
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum = " << checksum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = dgemm_time/iterations/matrices;
+    auto nflops = 2.0 * std::pow(forder,3) * ngpus;
+    std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Residuum           = " << residuum << std::endl;
+    return 1;
+  }
+
+  for (int i=0; i<ngpus; ++i) {
+      prk::CUDA::check( cudaFreeHost(h_c[i]) );
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h
index bb1d6b19c..d68650708 100644
--- a/Cxx11/prk_cuda.h
+++ b/Cxx11/prk_cuda.h
@@ -83,6 +83,23 @@ namespace prk
                     }
                 }
 
+                // do not use cached value as a hedge against weird stuff happening
+                int num_gpus() {
+                    int g;
+                    prk::CUDA::check( cudaGetDeviceCount(&g) );
+                    return g;
+                }
+
+                int get_gpu() {
+                    int g;
+                    prk::CUDA::check( cudaGetDevice(&g) );
+                    return g;
+                }
+
+                void set_gpu(int g) {
+                    prk::CUDA::check( cudaSetDevice(g) );
+                }
+
                 void print() {
                     for (auto i=0; i<nDevices; ++i) {
                         std::cout << "device name: " << vDevices[i].name << "\n";
diff --git a/Cxx11/prk_mpi.h b/Cxx11/prk_mpi.h
new file mode 100644
index 000000000..d3a0b764a
--- /dev/null
+++ b/Cxx11/prk_mpi.h
@@ -0,0 +1,115 @@
+#ifndef PRK_MPI_HPP
+#define PRK_MPI_HPP
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <mpi.h>
+
+namespace prk
+{
+    namespace MPI
+    {
+        void check(int errorcode)
+        {
+            if (errorcode==MPI_SUCCESS) {
+                return;
+            } else {
+                int resultlen;
+
+                char errorcode_string[MPI_MAX_ERROR_STRING];
+                char errorclass_string[MPI_MAX_ERROR_STRING];
+
+                int errorclass;
+                MPI_Error_class(errorcode, &errorclass);
+
+                MPI_Error_string(errorclass, errorclass_string, &resultlen);
+                std::cerr << "MPI error: class " << errorclass << ", " << errorclass_string << std::endl;
+
+                MPI_Error_string(errorcode, errorcode_string, &resultlen);
+                std::cerr << "MPI error: code " << errorcode << ", " << errorcode_string << std::endl;
+
+                MPI_Abort(MPI_COMM_WORLD, errorcode);
+                std::abort(); // unreachable
+            }
+        }
+
+        class state {
+
+          public:
+            state(void) {
+                int is_init, is_final;
+                MPI_Initialized(&is_init);
+                MPI_Finalized(&is_final);
+                if (!is_init && !is_final) {
+                    MPI_Init(NULL,NULL);
+                }
+            }
+
+            state(int argc, char** argv) {
+                int is_init, is_final;
+                MPI_Initialized(&is_init);
+                MPI_Finalized(&is_final);
+                if (!is_init && !is_final) {
+                    MPI_Init(&argc,&argv);
+                }
+            }
+
+            ~state(void) {
+                int is_init, is_final;
+                MPI_Initialized(&is_init);
+                MPI_Finalized(&is_final);
+                if (is_init && !is_final) {
+                    MPI_Finalize();
+                }
+            }
+
+        };
+
+        int rank(MPI_Comm comm = MPI_COMM_WORLD) {
+            int rank;
+            prk::MPI::check( MPI_Comm_rank(comm,&rank) );
+            return rank;
+        }
+
+        int size(MPI_Comm comm = MPI_COMM_WORLD) {
+            int size;
+            prk::MPI::check( MPI_Comm_size(comm,&size) );
+            return size;
+        }
+
+        void barrier(MPI_Comm comm = MPI_COMM_WORLD) {
+            prk::MPI::check( MPI_Barrier(comm) );
+        }
+
+        double min(double in, MPI_Comm comm = MPI_COMM_WORLD) {
+            double out;
+            prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_MIN, comm) );
+            return out;
+        }
+
+        double max(double in, MPI_Comm comm = MPI_COMM_WORLD) {
+            double out;
+            prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_MAX, comm) );
+            return out;
+        }
+
+        double avg(double in, MPI_Comm comm = MPI_COMM_WORLD) {
+            double out;
+            prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_SUM, comm) );
+            out /= prk::MPI::size(comm);
+            return out;
+        }
+
+        void stats(double in, double * min, double * max, double * avg, MPI_Comm comm = MPI_COMM_WORLD) {
+            prk::MPI::check( MPI_Allreduce(&in, min, 1, MPI_DOUBLE, MPI_MIN, comm) );
+            prk::MPI::check( MPI_Allreduce(&in, max, 1, MPI_DOUBLE, MPI_MAX, comm) );
+            prk::MPI::check( MPI_Allreduce(&in, avg, 1, MPI_DOUBLE, MPI_SUM, comm) );
+            *avg /= prk::MPI::size(comm);
+        }
+
+    } // MPI namespace
+
+} // prk namespace
+
+#endif // PRK_MPI_HPP
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 0f5fafb75..9b1188db9 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -132,6 +132,7 @@ CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
 #
 # We assume you have installed an implementation of MPI-3 that is in your path.
 MPICC=mpicc
+MPIFLAGS=-lmpi
 #
 # Fortran 2008 coarrays
 #

From 6eb5b36f65fce2c54eaf9cbbb70e97e8aaf8a320 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 15 Jan 2020 09:26:41 -0800
Subject: [PATCH 239/245] more NVCC fun (#431)


From 93d58c2abc28ad560dcb079140a1bfacd5b2ad66 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 22 Jan 2020 13:38:35 -0800
Subject: [PATCH 240/245] oneAPI make.defs (#432)

---
 common/make.defs.oneapi | 116 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 common/make.defs.oneapi

diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi
new file mode 100644
index 000000000..4786d23ce
--- /dev/null
+++ b/common/make.defs.oneapi
@@ -0,0 +1,116 @@
+#
+# This file shows the Intel toolchain options for PRKs using
+# OpenMP, MPI and/or Fortran coarrays only.
+#
+# Base compilers and language options
+#
+# We assume you have Intel MPI and have setup your environment with e.g.
+# .  /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64
+# in your .bashrc.
+#
+# C99 is required in some implementations.
+CC=icx -std=c11 -pthread
+#EXTRA_CLIBS=-lrt
+# All of the Fortran code is written for the 2008 standard and requires preprocessing.
+FC=ifx -fpp
+# C++11 may not be required but does no harm here.
+CXX=icpx -std=gnu++17 -pthread
+#
+# Compiler flags
+#
+# -xHOST is appropriate for most cases.
+DEFAULT_OPT_FLAGS=-g -O3 -xHOST
+#
+# If you are compiling for KNL on a Xeon login node, use the following:
+# DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512
+#
+#DEFAULT_OPT_FLAGS+=-qopt-report=5
+#
+# OpenMP flags
+#
+OPENMPFLAG=-fiopenmp
+OPENMPSIMDFLAG=-fiopenmp
+OFFLOADFLAG=-fopenmp-targets=spir64
+#
+# OpenCL flags
+#
+# MacOS
+#OPENCLFLAG=-framework OpenCL
+# POCL
+# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct...
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL
+# Linux
+OPENCLDIR=/etc/alternatives/opencl-intel-tools
+OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
+#
+# SYCL flags
+#
+SYCLCXX=dpcpp
+SYCLFLAG=-fsycl -fsycl-unnamed-lambda
+SYCLFLAG+=-std=c++17 -O3
+#
+#
+# OCCA
+#
+#OCCADIR=${HOME}/prk-repo/Cxx11/occa
+#
+# TBB
+#
+TBBFLAG=-tbb
+#TBBFLAG=-tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE
+#
+# Parallel STL, Boost, etc.
+#
+BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
+RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
+PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}
+KOKKOSDIR=/opt/kokkos/intel
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl
+RAJADIR=/opt/raja/intel
+RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
+THRUSTDIR=/opt/nvidia/thrust
+THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
+#
+# CBLAS for C++ DGEMM
+#
+#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions
+CBLASFLAG=-DMKL -mkl
+#
+# CUDA flags
+#
+# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander
+#NVCC=/opt/llvm/cocl/bin/cocl
+# Linux w/ NVIDIA CUDA
+NVCC=nvcc
+CUDAFLAGS=-g -O3 -std=c++11
+CUDAFLAGS+=-arch=sm_50
+# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
+CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED
+#
+# ISPC
+#
+ISPC=ispc
+ISPCFLAG=-O3 --target=host --opt=fast-math
+#
+# MPI
+#
+# We assume you have Intel MPI and have setup your environment with e.g.
+# . /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh
+# in your .bashrc.
+#
+# mpiicc wraps icc.  mpicc and mpigcc wrap gcc.
+MPICC=mpiicc -std=c99
+#
+# Fortran 2008 coarrays
+#
+# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details
+# single-node
+COARRAYFLAG=-coarray
+# multi-node
+# COARRAYFLAG=-coarray=distributed
+#
+# MEMKIND (used in C1z)
+#
+MEMKINDDIR=/home/parallels/PRK/deps
+MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib

From f33b188bfbc69c03c7b6823bf8523637063b9187 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 22 Jan 2020 13:47:02 -0800
Subject: [PATCH 241/245] disable prebuilt SYCL kernels w/ DPC++ (#433)

* disable use of pre-build kernels for DPC++

/tmp/nstream-sycl-b825e1.o: In function
`cl::sycl::detail::program_impl::build(std::string const&)':
nstream-sycl.cc:(.text._ZN2cl4sycl6detail12program_impl5buildERKSs[_ZN2cl4sycl6detail12program_impl5buildERKSs]+0x3bc):
undefined reference to
`cl::sycl::detail::ProgramManager::getProgramBuildLog(_pi_program*
const&)'
clang++: error: linker command failed with exit code 1 (use -v to see
invocation)
---
 Cxx11/prk_sycl.h        | 2 +-
 common/make.defs.oneapi | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h
index cdd18d211..6e54a35c0 100644
--- a/Cxx11/prk_sycl.h
+++ b/Cxx11/prk_sycl.h
@@ -9,7 +9,7 @@
 namespace sycl = cl::sycl;
 
 // prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL
-#if defined(TRISYCL) || defined(__HIPSYCL__)
+#if defined(TRISYCL) || defined(__HIPSYCL__) || defined(DPCPP)
 #define PREBUILD_KERNEL 0
 #else
 #define PREBUILD_KERNEL 1
diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi
index 4786d23ce..b736557d3 100644
--- a/common/make.defs.oneapi
+++ b/common/make.defs.oneapi
@@ -48,6 +48,7 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL
 SYCLCXX=dpcpp
 SYCLFLAG=-fsycl -fsycl-unnamed-lambda
 SYCLFLAG+=-std=c++17 -O3
+SYCLFLAG+=-DDPCPP
 #
 #
 # OCCA

From 0c6957ea933a2c4b2c2072514f7ae619e5ea69f2 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 7 Feb 2020 18:45:49 -0600
Subject: [PATCH 242/245] improve ISPC w/ help from Jeff Amstutz (#434)

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 C1z/transpose-ispc.c |  4 ++--
 C1z/transpose.ispc   | 12 +++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/C1z/transpose-ispc.c b/C1z/transpose-ispc.c
index 442a4f1e9..a659bad83 100644
--- a/C1z/transpose-ispc.c
+++ b/C1z/transpose-ispc.c
@@ -55,7 +55,7 @@
 
 #include "prk_util.h"
 
-int ispc_num_threads(void);
+//int ispc_num_threads(void);
 void initialize(const int order, double A[], double B[]);
 void transpose(const int order, double A[], double B[]);
 void transpose_tiled(const int order, double A[], double B[], const int tile_size);
@@ -93,7 +93,7 @@ int main(int argc, char * argv[])
   // a negative tile size means no tiling of the local transpose
   if (tile_size <= 0) tile_size = order;
 
-  printf("ISPC threads          = %d\n", ispc_num_threads());
+  //printf("ISPC threads          = %d\n", ispc_num_threads());
   printf("Number of iterations  = %d\n", iterations);
   printf("Matrix order          = %d\n", order);
   printf("Tile size             = %d\n", tile_size);
diff --git a/C1z/transpose.ispc b/C1z/transpose.ispc
index 086709d57..d2ff472fb 100644
--- a/C1z/transpose.ispc
+++ b/C1z/transpose.ispc
@@ -19,16 +19,14 @@ export void initialize(uniform const int order,
   }
 }
 
-#if 0
+#if 1
 export void transpose(uniform const int order,
                       uniform double A[],
                       uniform double B[])
 {
-  foreach (i = 0 ... order) {
-    for (uniform int j=0;j<order;j++) {
-      B[i*order+j] += A[j*order+i];
-      A[j*order+i] += 1.0;
-    }
+  foreach(i = 0 ... order, j = 0 ... order) {
+    B[i*order+j] += A[j*order+i];
+    A[j*order+i] += 1.0;
   }
 }
 #else
@@ -37,7 +35,7 @@ export void transpose(uniform const int order,
                       uniform double B[])
 {
   for (uniform int i=0;i<order; i++) {
-    for (uniform int j=0;j<order;j++) {
+    for (varying int j=programIndex;j<order;j+=programCount) {
       B[i*order+j] += A[j*order+i];
       A[j*order+i] += 1.0;
     }

From 5c669cdcb12c48beb34a5b33c69c93e8b128247f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 9 Feb 2020 17:44:26 -0800
Subject: [PATCH 243/245] Create SYCL.md

---
 common/SYCL.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 common/SYCL.md

diff --git a/common/SYCL.md b/common/SYCL.md
new file mode 100644
index 000000000..f0edf26f1
--- /dev/null
+++ b/common/SYCL.md
@@ -0,0 +1,70 @@
+# How to Install SYCL
+
+## triSYCL
+
+See https://github.com/triSYCL/triSYCL.  This is a header-only implementation, so you can use
+any C++17 compiler (C++14 might be sufficient).  You need Boost, while OpenMP or TBB are optional
+for threaded parallelism on the CPU.
+
+## CodePlay ComputeCpp
+
+See https://www.codeplay.com/products/computesuite/computecpp.
+
+## Intel Data Parallel C++
+
+This comes in two flavors.  You can compile the open-source version on GitHub and use `clang++ -fsycl`,
+or you can install oneAPI and use the `dpcpp` driver, which is a wrapper around `clang++ -fsycl`.
+
+### oneAPI Download
+
+See https://software.intel.com/en-us/articles/installation-guide-for-intel-oneapi-toolkits.
+
+### Linux packages
+
+See https://software.intel.com/en-us/articles/oneapi-repo-instructions.
+
+### Build from source
+
+See https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md for details.
+
+The following is my automation once the repo is cloned.
+
+```sh
+#!/bin/bash
+
+export SYCL_HOME=$HOME/ISYCL
+
+#cd $SYCL_HOME/llvm && time git checkout usmapi && time git pull
+cd $SYCL_HOME/llvm && time git checkout sycl && time git pull
+
+rm -rf $SYCL_HOME/build
+
+mkdir -p $SYCL_HOME/build && \
+    cd $SYCL_HOME/build && \
+    time cmake \
+        -DCMAKE_INSTALL_PREFIX=/opt/isycl \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DLLVM_ENABLE_PROJECTS="clang;llvm-spirv;sycl" \
+        -DLLVM_EXTERNAL_PROJECTS="llvm-spirv;sycl" \
+        -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=$SYCL_HOME/llvm/sycl \
+        -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=$SYCL_HOME/llvm/llvm-spirv \
+        -DLLVM_TOOL_SYCL_BUILD=ON \
+        -DLLVM_TOOL_LLVM_SPIRV_BUILD=ON \
+        $SYCL_HOME/llvm/llvm && \
+
+time make -j4 sycl-toolchain
+
+time make -j4 sycl-toolchain install #DESTDIR=/opt/isycl
+```
+
+## hipSYCL
+
+See https://github.com/illuhad/hipSYCL/tree/master/doc for other options.
+
+### Spack
+
+https://github.com/spack/spack/pull/14051 is not merged yet but this works if you grab the PR.
+
+```sh
+./bin/spack install  hipsycl +cuda
+```

From bdaa73871f8be5e69f7e2ef9fbd5bd20145749f5 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sun, 9 Feb 2020 17:57:45 -0800
Subject: [PATCH 244/245] update docs

---
 README.md | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 7214b0a9e..02170261d 100644
--- a/README.md
+++ b/README.md
@@ -89,10 +89,10 @@ f = see footnotes
 | OpenMP tasks         |  y  |    y    |     y     |    y    |        |       |
 | OpenMP target        |  y  |    y    |     y     |    y    |        |       |
 | OpenCL 1.x           |  i  |    y    |     y     |    y    |        |       |
-| SYCL                 |     |    y    |     y     |    y    |        |       |
+| SYCL                 |  i  |    y    |     y     |    y    |        |       |
 | Boost.Compute        |     |         |           |    y    |        |       |
 | Parallel STL         |  y  |    y    |     y     |    y    |        |       |
-| Thrust               |     |         |           |    y    |        |       |
+| Thrust               |     |         |     i     |    y    |        |       |
 | TBB                  |  y  |    y    |     y     |    y    |        |       |
 | Kokkos               |  y  |    y    |     y     |    y    |        |       |
 | RAJA                 |  y  |    y    |     y     |    y    |        |       |
@@ -111,13 +111,19 @@ f = see footnotes
 
 | Parallelism          | p2p | stencil | transpose | nstream | sparse |
 |----------------------|-----|---------|-----------|---------|--------|
-| None                 |  y  |    y    |     y     |         |        |
+| None                 |  y  |    y    |     y     |    y    |        |
 | C11 threads          |     |         |     y     |         |        |
-| OpenMP               |  y  |    y    |     y     |         |        |
-| OpenMP tasks         |  y  |    y    |     y     |         |        |
-| OpenMP target        |  y  |    y    |     y     |         |        |
+| OpenMP               |  y  |    y    |     y     |    y    |        |
+| OpenMP tasks         |  y  |    y    |     y     |    y    |        |
+| OpenMP target        |  y  |    y    |     y     |    y    |        |
 | Cilk                 |     |    y    |     y     |         |        |
 | ISPC                 |     |         |     y     |         |        |
+| MPI                  |     |         |           |    y    |        |
+
+There are versions of nstream with OpenMP that support memory allocation
+using [mmap](http://man7.org/linux/man-pages/man2/mmap.2.html)
+and [memkind](https://github.com/memkind/memkind), which can be used
+for testing novel memory systems, including persistent memory.
 
 * [ISPC](https://ispc.github.io/)
 

From 465169dee3bc15a05c9d1efa394755b2b4d62749 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Mon, 10 Feb 2020 13:53:08 -0800
Subject: [PATCH 245/245] SYCL soft fail (#435)

* remove MacOS Homebrew Boost path

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>

* SYCL codes shouldn't need ranges

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>

* do not exit on exceptions for device attempts

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 Cxx11/Makefile          |  2 +-
 Cxx11/nstream-sycl.cc   | 31 ++++++++++++++++++++++++++-----
 common/make.defs.oneapi |  2 +-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index bc2c8cd32..f40e0b8ac 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -46,7 +46,7 @@ PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL
 RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA
 THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST
 KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS
-SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 $(RANGEFLAGS)
+SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0
 ORNLACCFLAGS = $(ORNLACCFLAG)
 
 ifdef OCCADIR
diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc
index f9a891407..096a8f948 100644
--- a/Cxx11/nstream-sycl.cc
+++ b/Cxx11/nstream-sycl.cc
@@ -219,8 +219,8 @@ int main(int argc, char * argv[])
   prk::opencl::listPlatforms();
 #endif
 
-  try {
 #if SYCL_TRY_CPU_QUEUE
+  try {
     if (length<100000) {
         sycl::queue q(sycl::host_selector{});
         prk::SYCL::print_device_platform(q);
@@ -229,10 +229,22 @@ int main(int argc, char * argv[])
     } else {
         std::cout << "Skipping host device since it is too slow for large problems" << std::endl;
     }
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+    prk::SYCL::print_exception_details(e);
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+  }
 #endif
 
     // CPU requires spir64 target
 #if SYCL_TRY_CPU_QUEUE
+  try {
     if (1) {
         sycl::queue q(sycl::cpu_selector{});
         prk::SYCL::print_device_platform(q);
@@ -242,10 +254,22 @@ int main(int argc, char * argv[])
           run<double>(q, iterations, length);
         }
     }
+  }
+  catch (sycl::exception & e) {
+    std::cout << e.what() << std::endl;
+    prk::SYCL::print_exception_details(e);
+  }
+  catch (std::exception & e) {
+    std::cout << e.what() << std::endl;
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+  }
 #endif
 
     // NVIDIA GPU requires ptx64 target
 #if SYCL_TRY_GPU_QUEUE
+  try {
     if (1) {
         sycl::queue q(sycl::gpu_selector{});
         prk::SYCL::print_device_platform(q);
@@ -262,21 +286,18 @@ int main(int argc, char * argv[])
           }
         }
     }
-#endif
   }
   catch (sycl::exception & e) {
     std::cout << e.what() << std::endl;
     prk::SYCL::print_exception_details(e);
-    return 1;
   }
   catch (std::exception & e) {
     std::cout << e.what() << std::endl;
-    return 1;
   }
   catch (const char * e) {
     std::cout << e << std::endl;
-    return 1;
   }
+#endif
 
   return 0;
 }
diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi
index b736557d3..edb4a8274 100644
--- a/common/make.defs.oneapi
+++ b/common/make.defs.oneapi
@@ -62,7 +62,7 @@ TBBFLAG=-tbb
 #
 # Parallel STL, Boost, etc.
 #
-BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include
+BOOSTFLAG=
 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}